Load the Libraries

pacman::p_load(psych, xray , ggplot2, texreg, DT, wrapr, dplyr,
               sjmisc, sjlabelled, sjstats, sjPlot, tidyr,ggpubr,corrplot,
               knitr, kableExtra, captioner, car, excelR,ggcorrplot,Rmisc,sjmisc,funModeling,
               forcats, hrbrthemes,tidyverse,finalfit,patchwork,GGally,broom,caret,matrixTests,PerformanceAnalytics)

Connect the Dataset

df = read.csv("Well_being_Health_Behavior_Environmental_Fact.csv")

First Glance of the Data set

Head View of the Data Set

pacman::p_load("DT")
head(x = df,n=5) %>% datatable(rownames=TRUE,filter = "top", options=list(pageLength = 10, scrollX=T))

Tail View of the Dataset

pacman::p_load("DT")
tail(x = df,n=5) %>% datatable(rownames=TRUE,filter = "top", options=list(pageLength = 10, scrollX=T))

Head Tail View of the DataSet

pacman::p_load("DT","psych")
headTail(x = df,top = 4,bottom = 4) %>% datatable(rownames=TRUE,filter = "top", options=list(pageLength = 10, scrollX=T,scrollY=T))

Scrutinizing the Data

3S of the Dataset

Size of the Dataset

cat("The Dataset (df) contains ",nrow(df)," records and", ncol(df)," Columns or variables.")
## The Dataset (df) contains  395697  records and 33  Columns or variables.

Structure of the Data Set

str(df)
## 'data.frame':    395697 obs. of  33 variables:
##  $ X                     : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ age                   : int  30 64 49 43 68 38 23 61 78 82 ...
##  $ wellbeing             : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ SocialSupport         : int  5 4 3 5 4 4 5 3 5 3 ...
##  $ race                  : int  0 1 1 1 1 1 1 1 1 0 ...
##  $ income                : int  5 5 1 5 2 5 5 NA NA NA ...
##  $ GeneralHealth         : int  4 4 5 2 5 4 1 3 3 3 ...
##  $ PoorPhysicalHealthDays: int  0 5 NA 0 30 0 0 0 4 0 ...
##  $ PoorMentalHealthDays  : int  0 0 0 0 3 0 0 0 0 0 ...
##  $ Asthma                : int  0 1 1 0 1 0 0 0 0 0 ...
##  $ HealthInsurance       : int  1 1 1 1 0 1 1 1 1 1 ...
##  $ CVD                   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ LimitedActivity       : int  0 1 1 0 1 1 0 0 0 0 ...
##  $ Diabetes              : int  0 1 0 0 0 0 0 0 0 0 ...
##  $ employ                : int  1 7 8 1 8 1 1 7 7 5 ...
##  $ BMI                   : int  3 3 3 2 1 2 2 3 2 1 ...
##  $ HeavyDrinker          : int  0 0 0 1 0 0 0 0 0 0 ...
##  $ CurrentSmoker         : int  0 0 1 0 0 0 0 0 0 0 ...
##  $ PoorSleepDays         : int  1 30 0 0 14 30 0 2 0 2 ...
##  $ PhysicalActivity      : int  1 0 0 1 0 0 1 1 1 0 ...
##  $ marital               : int  1 1 0 1 1 1 1 1 1 0 ...
##  $ PrematureMortality    : num  440 440 440 440 440 ...
##  $ HouseholdIncome       : int  48863 48863 48863 48863 48863 48863 48863 48863 48863 48863 ...
##  $ ParkAccess            : int  20 20 20 20 20 20 20 20 20 20 ...
##  $ CrimeRate             : int  300 300 300 300 300 300 300 300 300 300 ...
##  $ UnemploymentRate      : num  8 8 8 8 8 8 8 8 8 8 ...
##  $ WaterSafety           : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ HighSchoolRate        : int  80 80 80 80 80 80 80 80 80 80 ...
##  $ SomeCollegeRate       : num  54.4 54.4 54.4 54.4 54.4 54.4 54.4 54.4 54.4 54.4 ...
##  $ AccessToRecFacility   : num  7.3 7.3 7.3 7.3 7.3 7.3 7.3 7.3 7.3 7.3 ...
##  $ FastFoodPercentage    : int  52 52 52 52 52 52 52 52 52 52 ...
##  $ PM2.5                 : num  13.1 13.1 13.1 13.1 13.1 ...
##  $ PopulationDensity     : num  91.8 91.8 91.8 91.8 91.8 91.8 91.8 91.8 91.8 91.8 ...




Interpretation from the Structure of the Dataset :



  1. The Dataset originally has only numerical and integer values

  2. Our Target variable “wellbeing” is also an integer datatype. However, we expect that into a category as we conceive it as classification thing of Wellbeing in the form of category as (yes or no) or (good or bad).
    2.Action : Convert “Wellbeing” into a factor

  3. It is also evident by looking at the structure of the dataset that several other variables like (Asthma, HealthInsurance, CVD, Diabetes, HeavyDrinker) to name a few are supposed to be categories or factors but infact they are integers.
    3.Action : Convert those set of Categorical Variables as Factors from Integers.

  4. Some set of variables are of the perfect type for example - Age is an integer and is supposed to be one. PM2.5 which represents the Fine Particle Matter in micrograms/cubic meter is actually a numerical data in a continous form.

  5. Some Variables such as BMI and income were expected to be continuous. However, they are ordinal values mentioned in integers which is a point of concern

5. Action : Convert those variables into probable factors whenver needed in Analytics

Summary of the DataSet

summary(df)
##        X               age          wellbeing     SocialSupport  
##  Min.   :     1   Min.   : 7.00   Min.   :0.000   Min.   :1.000  
##  1st Qu.: 98925   1st Qu.:45.00   1st Qu.:0.000   1st Qu.:4.000  
##  Median :197849   Median :57.00   Median :0.000   Median :5.000  
##  Mean   :197849   Mean   :56.32   Mean   :0.055   Mean   :4.181  
##  3rd Qu.:296773   3rd Qu.:69.00   3rd Qu.:0.000   3rd Qu.:5.000  
##  Max.   :395697   Max.   :99.00   Max.   :1.000   Max.   :5.000  
##                                   NA's   :17025   NA's   :20998  
##       race           income      GeneralHealth   PoorPhysicalHealthDays
##  Min.   :0.000   Min.   :1.00    Min.   :1.000   Min.   : 0.000        
##  1st Qu.:1.000   1st Qu.:2.00    1st Qu.:2.000   1st Qu.: 0.000        
##  Median :1.000   Median :4.00    Median :3.000   Median : 0.000        
##  Mean   :0.806   Mean   :3.61    Mean   :2.586   Mean   : 4.472        
##  3rd Qu.:1.000   3rd Qu.:5.00    3rd Qu.:3.000   3rd Qu.: 3.000        
##  Max.   :1.000   Max.   :5.00    Max.   :5.000   Max.   :30.000        
##  NA's   :5309    NA's   :54657   NA's   :1555    NA's   :9341          
##  PoorMentalHealthDays     Asthma       HealthInsurance       CVD       
##  Min.   : 0.000       Min.   :0.0000   Min.   :0.0000   Min.   :0.000  
##  1st Qu.: 0.000       1st Qu.:0.0000   1st Qu.:1.0000   1st Qu.:0.000  
##  Median : 0.000       Median :0.0000   Median :1.0000   Median :0.000  
##  Mean   : 3.497       Mean   :0.0936   Mean   :0.8955   Mean   :0.067  
##  3rd Qu.: 2.000       3rd Qu.:0.0000   3rd Qu.:1.0000   3rd Qu.:0.000  
##  Max.   :30.000       Max.   :1.0000   Max.   :1.0000   Max.   :1.000  
##  NA's   :7361         NA's   :2590     NA's   :991      NA's   :3928   
##  LimitedActivity     Diabetes          employ           BMI       
##  Min.   :0.0000   Min.   :0.0000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:1.000   1st Qu.:1.000  
##  Median :0.0000   Median :0.0000   Median :3.000   Median :2.000  
##  Mean   :0.2723   Mean   :0.1274   Mean   :3.885   Mean   :1.933  
##  3rd Qu.:1.0000   3rd Qu.:0.0000   3rd Qu.:7.000   3rd Qu.:3.000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :8.000   Max.   :3.000  
##  NA's   :1858     NA's   :395      NA's   :1322    NA's   :17290  
##   HeavyDrinker   CurrentSmoker    PoorSleepDays    PhysicalActivity
##  Min.   :0.000   Min.   :0.0000   Min.   : 0.000   Min.   :0.0000  
##  1st Qu.:0.000   1st Qu.:0.0000   1st Qu.: 0.000   1st Qu.:0.0000  
##  Median :0.000   Median :0.0000   Median : 3.000   Median :1.0000  
##  Mean   :0.046   Mean   :0.1583   Mean   : 7.707   Mean   :0.7303  
##  3rd Qu.:0.000   3rd Qu.:0.0000   3rd Qu.:12.000   3rd Qu.:1.0000  
##  Max.   :1.000   Max.   :1.0000   Max.   :30.000   Max.   :1.0000  
##  NA's   :10742   NA's   :2463     NA's   :7560     NA's   :533     
##     marital       PrematureMortality HouseholdIncome    ParkAccess    
##  Min.   :0.0000   Min.   :133.5      Min.   : 22289   Min.   :  1.00  
##  1st Qu.:0.0000   1st Qu.:294.0      1st Qu.: 41103   1st Qu.: 13.00  
##  Median :1.0000   Median :345.0      Median : 47930   Median : 32.00  
##  Mean   :0.5552   Mean   :355.8      Mean   : 50177   Mean   : 34.58  
##  3rd Qu.:1.0000   3rd Qu.:405.7      3rd Qu.: 56166   3rd Qu.: 52.00  
##  Max.   :1.0000   Max.   :883.5      Max.   :119525   Max.   :100.00  
##  NA's   :1559                                         NA's   :9633    
##    CrimeRate      UnemploymentRate  WaterSafety      HighSchoolRate 
##  Min.   :  12.0   Min.   : 1.100   Min.   :  0.000   Min.   : 27.0  
##  1st Qu.: 202.0   1st Qu.: 7.000   1st Qu.:  0.000   1st Qu.: 74.0  
##  Median : 343.0   Median : 8.400   Median :  1.000   Median : 80.0  
##  Mean   : 405.6   Mean   : 8.716   Mean   :  6.511   Mean   : 79.1  
##  3rd Qu.: 558.0   3rd Qu.:10.200   3rd Qu.:  5.000   3rd Qu.: 86.0  
##  Max.   :2062.0   Max.   :29.700   Max.   :100.000   Max.   :100.0  
##  NA's   :4174                      NA's   :5039      NA's   :16     
##  SomeCollegeRate AccessToRecFacility FastFoodPercentage     PM2.5      
##  Min.   :23.20   Min.   : 0.00       Min.   :  8.00     Min.   : 6.50  
##  1st Qu.:54.00   1st Qu.: 6.90       1st Qu.: 44.00     1st Qu.:10.86  
##  Median :61.20   Median : 9.80       Median : 50.00     Median :11.74  
##  Mean   :60.45   Mean   :10.13       Mean   : 48.33     Mean   :11.63  
##  3rd Qu.:67.70   3rd Qu.:13.00       3rd Qu.: 54.00     3rd Qu.:12.78  
##  Max.   :90.40   Max.   :57.50       Max.   :100.00     Max.   :14.78  
##                                      NA's   :42                        
##  PopulationDensity
##  Min.   :    1.7  
##  1st Qu.:   66.3  
##  Median :  276.9  
##  Mean   : 1186.5  
##  3rd Qu.:  991.3  
##  Max.   :69468.4  
## 




Interpretation from the Structure of the Dataset :



  1. The Summary provides a good support of evidence for those points of concerns expressed earlier in the structure of the dataset. It describes the standard statistics for those variables which are supposed to be categorical variables either ordinal or nominal

  2. The Summary indicates something interesting that many records and most variables in the dataset contains null values.

  3. Action : Either Remove or Impute Null Values ensuring that in each of the case does not statistically significantly alters the dataset.

  4. The Good part that the summary iterates is that the almost all variables are within their supposed range. For an instance, no age record is negative, No PM2.5 less than 0, Household income min is not negative and so on. This is a sigh of relief.

Below are some boxplots for concerning continuous and discrete variables demonstrating their ranges with outliers

Boxplots For Range Check and Outlier Detection

Age Boxplot

This boxplot for age seems very healthy.

Premature Mortality Boxplot

boxplot(df["PrematureMortality"],main = "Premature Mortality Box Plot", xlab = "Premature Mortality",horizontal = TRUE)

The distribution of the Premature Mortality have some outliers in the right ends shows that it is skewed to the right.

Checking Distribution of Premature Mortality

pacman::p_load("ggplot2")
ggplot(data=df,aes(x=PrematureMortality)) + geom_density()+
 ggtitle ("Premature Mortality Density Plot") +
theme (axis.title.x=element_text (size=16, face="bold", colour="blue")) +
theme (axis.text.x=element_text (size=14 ) ) 

The distribution does not seems normal but I suspect that when we apply the CLT or Bootstrapping Normality would be proven.

From the above test, we could possibly reject the null hypothesis that the distribution of premature Mortality is normal. However through Central Limit Theorem I shall prove the normality.

s30 <- c()
s50 <- c()
s500 <- c()
n =396000
for ( i in 1:n){
s30[i] = mean(sample(df$PrematureMortality,30, replace = TRUE))
s50[i] = mean(sample(df$PrematureMortality,50, replace = TRUE))
s500[i] = mean(sample(df$PrematureMortality,500, replace = TRUE))
}
par(mfrow=c(1,3))
hist(s30, col ="lightblue",main="Sample size=30",xlab ="Premature Mortality")
abline(v = mean(s30), col = "red")

hist(s50, col ="lightgreen", main="Sample size=50",xlab ="Premature Mortality")
abline(v = mean(s50), col = "red")

hist(s500, col ="orange",main="Sample size=500",xlab ="Premature Mortality")
abline(v = mean(s500), col = "red")

Interpretation

We could clearly see a bell curve with Central Limit Theorem and prove that with sufficiently large sample size any distribution that does not seem to be normally distributed gets normally distributed using Central Limit Theorem or Bootstrapping.

Household Income Boxplot

boxplot(df["HouseholdIncome"],main = "Household Income Box Plot", xlab = "Household Income",horizontal = TRUE)

The distribution of the Household Income have some outliers in the right ends shows that it is skewed to the right.

Checking Distribution of Household Income

pacman::p_load("ggplot2")
ggplot(data=df,aes(x=HouseholdIncome)) + geom_density()+
 ggtitle ("Household Income Density Plot") +
theme (axis.title.x=element_text (size=16, face="bold", colour="blue")) +
theme (axis.text.x=element_text (size=14 ) ) 

The distribution does not seems normal but I suspect that when we apply the CLT or Bootstrapping Normality would be proven.


So from now on we shall not Check with the CLT or Bootstrapping for the normal distribution of any variable rather assume that the distribution will be normal whenever viewed from the lens of CLT or Bootstrapping.

Unemployment Rate Boxplot

boxplot(df["UnemploymentRate"],main = "Unemployment Rate   Box Plot", xlab = "Unemployment Rate  ",horizontal = TRUE)

The distribution of the Unemployment Rate have some outliers on both ends more probably on the right which signifies that it is skewed more to the right.

Checking Distribution of Unemployment Rate

pacman::p_load("ggplot2")
ggplot(data=df,aes(x=UnemploymentRate)) + geom_density()+
 ggtitle ("Unemployment Rate Density Plot") +
theme (axis.title.x=element_text (size=16, face="bold", colour="blue")) +
theme (axis.text.x=element_text (size=14 ) ) 

The distribution does not seems normal but I suspect that when we apply the CLT or Bootstrapping Normality would be proven.

High School Rate Boxplot

boxplot(df["HighSchoolRate"],main = "High School Rate Box Plot", xlab = "High School Rate",horizontal = TRUE)

The distribution of the High School Rate have some outliers on left ends which signifies that it is skewed more to the left.

Checking Distribution of Unemployment Rate

pacman::p_load("ggplot2")
ggplot(data=df,aes(x=HighSchoolRate)) + geom_density()+
 ggtitle ("High School Rate Density Plot") +
theme (axis.title.x=element_text (size=16, face="bold", colour="blue")) +
theme (axis.text.x=element_text (size=14 ) ) 
## Warning: Removed 16 rows containing non-finite values (`stat_density()`).

The distribution does not seems normal but I suspect that when we apply the CLT or Bootstrapping Normality would be proven.

Some College Rate Boxplot

boxplot(df["SomeCollegeRate"],main = "Some College Rate Box Plot", xlab = "Some College Rate  ",horizontal = TRUE)

The distribution of the Some College Rate have some outliers on left ends which signifies that it is skewed more to the left.

Checking Distribution of Unemployment Rate

pacman::p_load("ggplot2")
ggplot(data=df,aes(x=SomeCollegeRate)) + geom_density()+
 ggtitle ("Some College Rate Density Plot") +
theme (axis.title.x=element_text (size=16, face="bold", colour="blue")) +
theme (axis.text.x=element_text (size=14 ) ) 

The distribution does not seems normal but I suspect that when we apply the CLT or Bootstrapping Normality would be proven.

Access To Recreational Facility Boxplot

boxplot(df["AccessToRecFacility"],main = "Access To Recreational Facility Box Plot", xlab = "Access To Recreational Facility  ",horizontal = TRUE)

The distribution of the Access To Recreational Facility have some outliers on right ends which signifies that it is skewed more to the left.

Checking Distribution of Unemployment Rate

pacman::p_load("ggplot2")
ggplot(data=df,aes(x=AccessToRecFacility)) + geom_density()+
 ggtitle ("Access To Recreational Facility Density Plot") +
theme (axis.title.x=element_text (size=16, face="bold", colour="blue")) +
theme (axis.text.x=element_text (size=14 ) ) 

The distribution does not seems normal but I suspect that when we apply the CLT or Bootstrapping Normality would be proven.

Fast Food Percentage Boxplot

boxplot(df["FastFoodPercentage"],main = "Fast Food Percentage Box Plot", xlab = "Fast Food Percentage  ",horizontal = TRUE)

The distribution of the Fast Food Percentage have some outliers on both ends

Checking Distribution of Unemployment Rate

pacman::p_load("ggplot2")
ggplot(data=df,aes(x=FastFoodPercentage)) + geom_density()+
 ggtitle ("Fast Food Percentage Density Plot") +
theme (axis.title.x=element_text (size=16, face="bold", colour="blue")) +
theme (axis.text.x=element_text (size=14 ) ) 
## Warning: Removed 42 rows containing non-finite values (`stat_density()`).

The distribution seems normal with multiple peaks and higher kurtosis

PM2.5 Boxplot

boxplot(df["PM2.5"],main = "PM2.5 Box Plot", xlab = "PM2.5  ",horizontal = TRUE)

The distribution of the PM2.5 have some outliers on left ends which signifies that it is skewed more to the left.

Checking Distribution of Unemployment Rate

pacman::p_load("ggplot2")
ggplot(data=df,aes(x=PM2.5)) + geom_density()+
 ggtitle ("PM2.5 Density Plot") +
theme (axis.title.x=element_text (size=16, face="bold", colour="blue")) +
theme (axis.text.x=element_text (size=14 ) ) 

The distribution does not seems normal but I suspect that when we apply the CLT or Bootstrapping Normality would be proven.

Population Density Boxplot

boxplot(df["PopulationDensity"],main = "Population Density Box Plot", xlab = "Population Density  ",horizontal = TRUE)

The distribution of the Population Density have some outliers on right ends which signifies that it is skewed more to the right

Checking Distribution of Unemployment Rate

pacman::p_load("ggplot2")
ggplot(data=df,aes(x=PopulationDensity)) + geom_density()+
 ggtitle ("Population Density Density Plot") +
theme (axis.title.x=element_text (size=16, face="bold", colour="blue")) +
theme (axis.text.x=element_text (size=14 ) ) 

The distribution does not seems normal but I suspect that when we apply the CLT or Bootstrapping Normality would be proven.

Interpretation of Boxplots for every variable



As far as continuous variables are concerned their distribution is quite questionable however, with Central Limit Theorem it could be deduced that they are representative of their population.
Also the boxplot help us indicates that the variables are in their expected range and have acceptable set of outliers and shall help us validate the min, max, median and range of those variables.

Inspect the presence and purview of null values

Checking the count of null values by Variable

apply(apply(df,2,is.na),2,sum)
##                      X                    age              wellbeing 
##                      0                      0                  17025 
##          SocialSupport                   race                 income 
##                  20998                   5309                  54657 
##          GeneralHealth PoorPhysicalHealthDays   PoorMentalHealthDays 
##                   1555                   9341                   7361 
##                 Asthma        HealthInsurance                    CVD 
##                   2590                    991                   3928 
##        LimitedActivity               Diabetes                 employ 
##                   1858                    395                   1322 
##                    BMI           HeavyDrinker          CurrentSmoker 
##                  17290                  10742                   2463 
##          PoorSleepDays       PhysicalActivity                marital 
##                   7560                    533                   1559 
##     PrematureMortality        HouseholdIncome             ParkAccess 
##                      0                      0                   9633 
##              CrimeRate       UnemploymentRate            WaterSafety 
##                   4174                      0                   5039 
##         HighSchoolRate        SomeCollegeRate    AccessToRecFacility 
##                     16                      0                      0 
##     FastFoodPercentage                  PM2.5      PopulationDensity 
##                     42                      0                      0

Plotting the null values by variable

library(tidyr)
pacman::p_load(inspectdf)
df %>% inspect_na %>% show_plot

Check the rows with null values

sum(apply(df,1,anyNA))
## [1] 119348
cat("There are around ",sum(apply(df,1,anyNA)), " null records in the dataset of in total", nrow(df),"records \n","In case we delete all of the", sum(apply(df,1,anyNA)),"records with missing values then we are actually losing around", 100*(sum(apply(df,1,anyNA))/nrow(df)),"% of the total dataset.") 
## There are around  119348  null records in the dataset of in total 395697 records 
##  In case we delete all of the 119348 records with missing values then we are actually losing around 30.16146 % of the total dataset.

Data Preparation

Address the Null Value Problem

Approach - I Deletion of All Null Values

df.omit <- na.omit(df)
pacman::p_load(psych,DT)
headTail(df.omit, 10, 10) %>% datatable( rownames = FALSE, filter="top", options = list(pageLength = 21, scrollX=T))
cat(nrow(df.omit),ncol(df.omit))
## 276349 33
#write.csv(df.omit,file="df.omit.csv")

Checking For Statistical Significance Difference Pre-Post Deletion of NULL Values

Due to large Sample size everything starts to make sense which can be seen by the p-value. Even after it shows statistically significant difference in the dataset before and after deleting null values. We tend to ignore the statistical evidence and stick to the fact that we have quite enough sample size to represent the population as a whole.

Recoding and Factorizing Variables

str(df.omit)
## 'data.frame':    276349 obs. of  33 variables:
##  $ X                     : int  1 2 4 5 6 7 12 13 14 15 ...
##  $ age                   : int  30 64 43 68 38 23 44 64 75 70 ...
##  $ wellbeing             : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ SocialSupport         : int  5 4 5 4 4 5 4 3 1 5 ...
##  $ race                  : int  0 1 1 1 1 1 1 0 1 1 ...
##  $ income                : int  5 5 5 2 5 5 5 3 3 4 ...
##  $ GeneralHealth         : int  4 4 2 5 4 1 5 4 1 2 ...
##  $ PoorPhysicalHealthDays: int  0 5 0 30 0 0 21 3 0 0 ...
##  $ PoorMentalHealthDays  : int  0 0 0 3 0 0 14 2 0 0 ...
##  $ Asthma                : int  0 1 0 1 0 0 1 0 0 0 ...
##  $ HealthInsurance       : int  1 1 1 0 1 1 1 1 1 1 ...
##  $ CVD                   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ LimitedActivity       : int  0 1 0 1 1 0 1 0 0 0 ...
##  $ Diabetes              : int  0 1 0 0 0 0 0 0 0 0 ...
##  $ employ                : int  1 7 1 8 1 1 7 7 7 7 ...
##  $ BMI                   : int  3 3 2 1 2 2 2 2 2 2 ...
##  $ HeavyDrinker          : int  0 0 1 0 0 0 0 0 0 0 ...
##  $ CurrentSmoker         : int  0 0 0 0 0 0 0 0 1 0 ...
##  $ PoorSleepDays         : int  1 30 0 14 30 0 15 10 0 15 ...
##  $ PhysicalActivity      : int  1 0 1 0 0 1 0 1 1 1 ...
##  $ marital               : int  1 1 1 1 1 1 0 1 0 0 ...
##  $ PrematureMortality    : num  440 440 440 440 440 ...
##  $ HouseholdIncome       : int  48863 48863 48863 48863 48863 48863 48863 48863 48863 48863 ...
##  $ ParkAccess            : int  20 20 20 20 20 20 20 20 20 20 ...
##  $ CrimeRate             : int  300 300 300 300 300 300 300 300 300 300 ...
##  $ UnemploymentRate      : num  8 8 8 8 8 8 8 8 8 8 ...
##  $ WaterSafety           : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ HighSchoolRate        : int  80 80 80 80 80 80 80 80 80 80 ...
##  $ SomeCollegeRate       : num  54.4 54.4 54.4 54.4 54.4 54.4 54.4 54.4 54.4 54.4 ...
##  $ AccessToRecFacility   : num  7.3 7.3 7.3 7.3 7.3 7.3 7.3 7.3 7.3 7.3 ...
##  $ FastFoodPercentage    : int  52 52 52 52 52 52 52 52 52 52 ...
##  $ PM2.5                 : num  13.1 13.1 13.1 13.1 13.1 ...
##  $ PopulationDensity     : num  91.8 91.8 91.8 91.8 91.8 91.8 91.8 91.8 91.8 91.8 ...
##  - attr(*, "na.action")= 'omit' Named int [1:119348] 3 8 9 10 11 22 26 27 29 32 ...
##   ..- attr(*, "names")= chr [1:119348] "3" "8" "9" "10" ...

Identifying the Variables Requires Factorization and Recoding

Factorizing

variables_recoded = c("wellbeing","SocialSupport","race","income","GeneralHealth","Asthma","HealthInsurance","CVD","LimitedActivity","Diabetes","employ","BMI","HeavyDrinker","CurrentSmoker","PhysicalActivity","marital")
for (i in variables_recoded){
  df.omit[,i] <- as.factor(df.omit[,i])
}

Checking the Status

str(df.omit)
## 'data.frame':    276349 obs. of  33 variables:
##  $ X                     : int  1 2 4 5 6 7 12 13 14 15 ...
##  $ age                   : int  30 64 43 68 38 23 44 64 75 70 ...
##  $ wellbeing             : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ SocialSupport         : Factor w/ 5 levels "1","2","3","4",..: 5 4 5 4 4 5 4 3 1 5 ...
##  $ race                  : Factor w/ 2 levels "0","1": 1 2 2 2 2 2 2 1 2 2 ...
##  $ income                : Factor w/ 5 levels "1","2","3","4",..: 5 5 5 2 5 5 5 3 3 4 ...
##  $ GeneralHealth         : Factor w/ 5 levels "1","2","3","4",..: 4 4 2 5 4 1 5 4 1 2 ...
##  $ PoorPhysicalHealthDays: int  0 5 0 30 0 0 21 3 0 0 ...
##  $ PoorMentalHealthDays  : int  0 0 0 3 0 0 14 2 0 0 ...
##  $ Asthma                : Factor w/ 2 levels "0","1": 1 2 1 2 1 1 2 1 1 1 ...
##  $ HealthInsurance       : Factor w/ 2 levels "0","1": 2 2 2 1 2 2 2 2 2 2 ...
##  $ CVD                   : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ LimitedActivity       : Factor w/ 2 levels "0","1": 1 2 1 2 2 1 2 1 1 1 ...
##  $ Diabetes              : Factor w/ 2 levels "0","1": 1 2 1 1 1 1 1 1 1 1 ...
##  $ employ                : Factor w/ 8 levels "1","2","3","4",..: 1 7 1 8 1 1 7 7 7 7 ...
##  $ BMI                   : Factor w/ 3 levels "1","2","3": 3 3 2 1 2 2 2 2 2 2 ...
##  $ HeavyDrinker          : Factor w/ 2 levels "0","1": 1 1 2 1 1 1 1 1 1 1 ...
##  $ CurrentSmoker         : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 2 1 ...
##  $ PoorSleepDays         : int  1 30 0 14 30 0 15 10 0 15 ...
##  $ PhysicalActivity      : Factor w/ 2 levels "0","1": 2 1 2 1 1 2 1 2 2 2 ...
##  $ marital               : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 1 2 1 1 ...
##  $ PrematureMortality    : num  440 440 440 440 440 ...
##  $ HouseholdIncome       : int  48863 48863 48863 48863 48863 48863 48863 48863 48863 48863 ...
##  $ ParkAccess            : int  20 20 20 20 20 20 20 20 20 20 ...
##  $ CrimeRate             : int  300 300 300 300 300 300 300 300 300 300 ...
##  $ UnemploymentRate      : num  8 8 8 8 8 8 8 8 8 8 ...
##  $ WaterSafety           : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ HighSchoolRate        : int  80 80 80 80 80 80 80 80 80 80 ...
##  $ SomeCollegeRate       : num  54.4 54.4 54.4 54.4 54.4 54.4 54.4 54.4 54.4 54.4 ...
##  $ AccessToRecFacility   : num  7.3 7.3 7.3 7.3 7.3 7.3 7.3 7.3 7.3 7.3 ...
##  $ FastFoodPercentage    : int  52 52 52 52 52 52 52 52 52 52 ...
##  $ PM2.5                 : num  13.1 13.1 13.1 13.1 13.1 ...
##  $ PopulationDensity     : num  91.8 91.8 91.8 91.8 91.8 91.8 91.8 91.8 91.8 91.8 ...
##  - attr(*, "na.action")= 'omit' Named int [1:119348] 3 8 9 10 11 22 26 27 29 32 ...
##   ..- attr(*, "names")= chr [1:119348] "3" "8" "9" "10" ...

Recoding the Variables

df.omit <- within(df.omit, {
  wellbeing <- Recode(wellbeing, '"1" = "Poor_Wellbeing"; "0" = "Good_Wellbeing"', as.factor=TRUE, to.value="=", interval=":", 
  separator=";")
})
df.omit <- within(df.omit, {
  CurrentSmoker <- Recode(CurrentSmoker, '"1" = "Current Smoker"; "0" = "Not a Current Smoker"; ;', as.factor=TRUE, to.value="=", 
  interval=":", separator=";")
})
df.omit <- within(df.omit, {
  HeavyDrinker <- Recode(HeavyDrinker, '"1" = "Heavy Drinker"; "0" = "Not a Heavy Drinker"; ; ;', as.factor=TRUE, to.value="=", 
  interval=":", separator=";")
})

df.omit <- within(df.omit, {
  HealthInsurance <- Recode(HealthInsurance , '"1" = "With Health Coverage"; "0" = "No Health Coverage"', as.factor=TRUE, to.value="=", interval=":", 
  separator=";")
})
df.omit <- within(df.omit, {
  CVD              <- Recode(CVD, '"1" = "Cardiovascular Disease"; "0" = "No Cardiovascular Disease"; ;', as.factor=TRUE, to.value="=", 
  interval=":", separator=";")
})
df.omit <- within(df.omit, {
  Diabetes        <- Recode(Diabetes, '"1" = "Diabetic"; "0" = "Non Diabetic"; ; ;', as.factor=TRUE, to.value="=", 
  interval=":", separator=";")
})

df.omit <- within(df.omit, {
  LimitedActivity <- Recode(LimitedActivity , '"1" = "Limited"; "0" = "Not Limited"; ; ;', as.factor=TRUE, to.value="=", 
  interval=":", separator=";")
})

df.omit <- within(df.omit, {
  PhysicalActivity <- Recode(PhysicalActivity , '"1" = "Adequate Activity"; "0" = "Low Activity"; ; ;', as.factor=TRUE, to.value="=", 
  interval=":", separator=";")
})

df.omit <- within(df.omit, {
  employ <- Recode(employ , '"1" = "Waged Employement"; "2" = "Self-Employed";"3" = "Unemployed for more than 1 yr" ;"4" = "Unemployed for less than 1 yr";"5" = "Homemaker";"6" = "Student";"7" = "Retired";"8" = "Unable to work" ;', as.factor=TRUE, to.value="=", 
  interval=":", separator=";")
})

df.omit <- within(df.omit, {
  BMI <- Recode(BMI , '"1" = "Low"; "2" = "Medium";"3" = "High" ;;;', as.factor=TRUE, to.value="=", 
  interval=":", separator=";")
})

df.omit$income <- recode_factor(df.omit$income, 
                                       "1" = "less than 15000", 
                                       "2" = "[15000-25000)", 
                                       "3" = "[25000-35000)",
                                       "4" = "[35000-50000)", 
                                       "5" = "50000 or more")

df.omit$Asthma <- recode_factor(df.omit$Asthma, '0' = "No", '1' = "Yes")

df.omit$SocialSupport <- recode_factor(df.omit$SocialSupport, '1' = "Never", '2' = " Rarely", '3' = " Sometimes", '4' = "Usually", '5' = "Always")

df.omit$race <- recode_factor(df.omit$race, '0' = "Other races", '1' = "White")

df.omit$GeneralHealth <- recode_factor(df.omit$GeneralHealth, '1' = "Excellent",
                                      '2' = "Very good",
                                      '3' = "Good",
                                      '4' = "Fair",
                                      '5' = "Poor")

Saving the Furbished File

write.csv(df.omit,"Final_Furbished.csv",row.names = FALSE)

Descriptive Analytics

Access the data from the furbished file

df_final = read.csv("Final_Furbished.csv")
variables_recoded = c("wellbeing","SocialSupport","race","income","GeneralHealth","Asthma","HealthInsurance","CVD","LimitedActivity","Diabetes","employ","BMI","HeavyDrinker","CurrentSmoker","PhysicalActivity","marital")
for (i in variables_recoded){
  df_final[,i] <- as.factor(df_final[,i])
}
summary(df_final)
##        X               age                 wellbeing         SocialSupport   
##  Min.   :     1   Min.   : 7.00   Good_Wellbeing:261596    Rarely   :  8651  
##  1st Qu.: 98518   1st Qu.:44.00   Poor_Wellbeing: 14753    Sometimes: 29784  
##  Median :196784   Median :56.00                           Always    :142509  
##  Mean   :197108   Mean   :55.43                           Never     : 12674  
##  3rd Qu.:295423   3rd Qu.:67.00                           Usually   : 82731  
##  Max.   :395697   Max.   :99.00                                              
##                                                                              
##           race                    income         GeneralHealth  
##  Other races: 49945   [15000-25000)  : 46105   Excellent:52458  
##  White      :226404   [25000-35000)  : 32252   Fair     :33796  
##                       [35000-50000)  : 42101   Good     :81357  
##                       50000 or more  :127462   Poor     :14504  
##                       less than 15000: 28429   Very good:94234  
##                                                                 
##                                                                 
##  PoorPhysicalHealthDays PoorMentalHealthDays Asthma      
##  Min.   : 0.000         Min.   : 0.000       No :250697  
##  1st Qu.: 0.000         1st Qu.: 0.000       Yes: 25652  
##  Median : 0.000         Median : 0.000                   
##  Mean   : 4.201         Mean   : 3.435                   
##  3rd Qu.: 3.000         3rd Qu.: 2.000                   
##  Max.   :30.000         Max.   :30.000                   
##                                                          
##              HealthInsurance                          CVD        
##  No Health Coverage  : 27420   Cardiovascular Disease   : 17975  
##  With Health Coverage:248929   No Cardiovascular Disease:258374  
##                                                                  
##                                                                  
##                                                                  
##                                                                  
##                                                                  
##     LimitedActivity           Diabetes     
##  Limited    : 72410   Diabetic    : 32822  
##  Not Limited:203939   Non Diabetic:243527  
##                                            
##                                            
##                                            
##                                            
##                                            
##                            employ           BMI        
##  Waged Employement            :121619   High  : 79736  
##  Retired                      : 74731   Low   : 94694  
##  Self-Employed                : 23167   Medium:101919  
##  Homemaker                    : 18497                  
##  Unable to work               : 17547                  
##  Unemployed for more than 1 yr:  8846                  
##  (Other)                      : 11942                  
##               HeavyDrinker                 CurrentSmoker    PoorSleepDays  
##  Heavy Drinker      : 14151   Current Smoker      : 44197   Min.   : 0.00  
##  Not a Heavy Drinker:262198   Not a Current Smoker:232152   1st Qu.: 0.00  
##                                                             Median : 3.00  
##                                                             Mean   : 7.77  
##                                                             3rd Qu.:12.00  
##                                                             Max.   :30.00  
##                                                                            
##           PhysicalActivity  marital    PrematureMortality HouseholdIncome 
##  Adequate Activity:207843   0:116948   Min.   :133.5      Min.   : 23751  
##  Low Activity     : 68506   1:159401   1st Qu.:290.6      1st Qu.: 41796  
##                                        Median :341.3      Median : 48693  
##                                        Mean   :350.7      Mean   : 50816  
##                                        3rd Qu.:400.2      3rd Qu.: 56750  
##                                        Max.   :883.5      Max.   :119525  
##                                                                           
##    ParkAccess       CrimeRate      UnemploymentRate  WaterSafety     
##  Min.   :  1.00   Min.   :  12.0   Min.   : 1.100   Min.   :  0.000  
##  1st Qu.: 13.00   1st Qu.: 200.0   1st Qu.: 6.900   1st Qu.:  0.000  
##  Median : 33.00   Median : 343.0   Median : 8.400   Median :  1.000  
##  Mean   : 34.77   Mean   : 397.8   Mean   : 8.626   Mean   :  6.423  
##  3rd Qu.: 53.00   3rd Qu.: 544.0   3rd Qu.:10.200   3rd Qu.:  5.000  
##  Max.   :100.00   Max.   :1627.0   Max.   :29.700   Max.   :100.000  
##                                                                      
##  HighSchoolRate   SomeCollegeRate AccessToRecFacility FastFoodPercentage
##  Min.   : 27.00   Min.   :24.20   Min.   : 0.00       Min.   :  8.00    
##  1st Qu.: 74.00   1st Qu.:54.70   1st Qu.: 7.20       1st Qu.: 44.00    
##  Median : 81.00   Median :61.90   Median : 9.90       Median : 50.00    
##  Mean   : 79.36   Mean   :61.13   Mean   :10.34       Mean   : 48.22    
##  3rd Qu.: 86.00   3rd Qu.:68.30   3rd Qu.:13.00       3rd Qu.: 54.00    
##  Max.   :100.00   Max.   :90.40   Max.   :51.60       Max.   :100.00    
##                                                                         
##      PM2.5       PopulationDensity
##  Min.   : 6.50   Min.   :    1.7  
##  1st Qu.:10.83   1st Qu.:   69.8  
##  Median :11.69   Median :  290.2  
##  Mean   :11.59   Mean   : 1036.1  
##  3rd Qu.:12.78   3rd Qu.:  991.3  
##  Max.   :14.78   Max.   :69468.4  
## 

Checking distribution of age, asthma, social support, race, general health and income

df_final %>%
ggplot(aes(x = wellbeing, fill = wellbeing)) + geom_bar(aes(y = (..count..)/sum(..count..))) + labs(title = "Wellbeing Distribution") + xlab("Wellbeing") + ylab("Percentage") 
## Warning: The dot-dot notation (`..count..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(count)` instead.

df_final$Age_group <- ifelse(df_final$age < 25, "Age 18 to 24",ifelse(df_final$age < 35, "Age 25 to 34",ifelse(df_final$age < 45, "Age 35 to 44", ifelse(df_final$age < 55, "Age 44 to 54",ifelse(df_final$age < 65, "Age 55 to 64", "Age 65 or older")))))
df_final %>%
ggplot(aes(x = reorder(Age_group, Age_group, length, decreasing = FALSE), fill = Age_group, angle = 45)) +
geom_bar(aes(y = (..count..)/sum(..count..))) + labs(title = "Age Group Distribution") + xlab("Age group") + ylab("Percentage") + theme(legend.position = "none",axis.text.x = element_text(angle = 45))

df_final %>%
ggplot(aes(x = SocialSupport, fill = SocialSupport)) + geom_bar() + geom_bar(aes(y = (..count..)/sum(..count..))) + labs(title = "Social Support Distribution") + xlab("Social Support") + ylab("Percentage")

df_final %>%
ggplot(aes(x = income, fill = income)) + geom_bar(aes(y = (..count..)/sum(..count..))) + labs(title = "Income Distribution") + xlab("Income") + ylab("Percentage") + theme(legend.position = "none",axis.text.x = element_text(angle = 45))

df_final %>%
ggplot(aes(x = GeneralHealth, fill = GeneralHealth)) + geom_bar(aes(y = (..count..)/sum(..count..))) + labs(title = "General Health Distribution") + xlab("General Health Status") + ylab("Percentage") + theme(legend.position = "none",axis.text.x = element_text(angle = 45))

df_final %>%
ggplot(aes(x = Asthma, fill = Asthma)) + geom_bar(aes(y = (..count..)/sum(..count..))) + labs(title = "Asthma Distribution") + xlab("Asthma") + ylab("Percentage") 

df_final %>%
ggplot(aes(x = race, fill = race)) + geom_bar(aes(y = (..count..)/sum(..count..))) + labs(title = "Race Distribution") + xlab("Race") + ylab("Percentage") 

Bivariate analysis

p1a <- df_final %>%
 ggplot(aes(x = SocialSupport, fill = wellbeing)) + geom_bar(position = "fill") + ylab("proportion") + labs(title = "Wellbeing status on different social support level")
p1a

p2 <- df_final %>%
  ggplot(aes(x = income, fill = wellbeing)) + geom_bar(position = "fill") + ylab("proportion") + theme(axis.text.x = element_text(angle = 30)) + labs(title = "Wellbeing status on different income status")
p2

p3 <- df_final %>%
  ggplot(aes(x = race, fill = wellbeing)) + geom_bar(position = "fill") + ylab("proportion") + labs(title = "Wellbeing status on different races")
p3

p4 <- df_final %>%
  ggplot(aes(x = Asthma, fill = wellbeing)) + geom_bar(position = "fill") + ylab("proportion") + labs(title = "Wellbeing status between people having or not Asthma")
p4

p5 <- df_final %>%
  ggplot(aes(x = Age_group, fill = wellbeing)) + geom_bar(position = "fill") + ylab("proportion") + labs(title = "Wellbeing status on different age group") + theme(axis.text.x = element_text(angle = 30))
p5

p6 <- df_final %>%
  ggplot(aes(x = GeneralHealth, fill = wellbeing)) + geom_bar(position = "fill") + ylab("proportion") + labs(title = "Wellbeing status on different general health status")
p6 

p8 <- df_final %>%
  ggplot(aes(x = CVD, fill = wellbeing)) + geom_bar(position = "fill") + ylab("proportion") + labs(title = "Wellbeing status on patient with and without CVD")
p8 

Association between Wellbeing and Race

table(df_final$wellbeing, df_final$race)
##                 
##                  Other races  White
##   Good_Wellbeing       46677 214919
##   Poor_Wellbeing        3268  11485
rx.p1 <- df_final %>% 
  ggplot(aes(x = race, fill = wellbeing)) + 
  geom_bar() + 
  theme(legend.position = "none")

rx.p2 <- df_final %>% 
  ggplot(aes(x = race, fill = wellbeing)) + 
  geom_bar(position = "fill") + 
  ylab("proportion")

library(patchwork)
rx.p1 + rx.p2


There seems to be minimal visual difference in the percentages of the Poor and Good Wellbeing for Race Non white and Hispanic and Race White and Hispanic

Association between Wellbeing and Smoking Status

table(df_final$wellbeing, df_final$HealthInsurance)
##                 
##                  No Health Coverage With Health Coverage
##   Good_Wellbeing              24204               237392
##   Poor_Wellbeing               3216                11537
rx.p1 <- df_final %>% 
  ggplot(aes(x = HealthInsurance, fill = wellbeing)) + 
  geom_bar() + 
  theme(legend.position = "none")

rx.p2 <- df_final %>% 
  ggplot(aes(x = HealthInsurance, fill = wellbeing)) + 
  geom_bar(position = "fill") + 
  ylab("proportion")

library(patchwork)
rx.p1 + rx.p2

From the Visual Evidence, it could be said that the Health Insurance affect the Wellbeing of an individual. It could interpreted from the graph that more proportions of individuals without Health Insurance possessing poor wellbeing as compared to those who have Health Coverage.

Association between Wellbeing and CVD

table(df_final$wellbeing, df_final$CVD)
##                 
##                  Cardiovascular Disease No Cardiovascular Disease
##   Good_Wellbeing                  16348                    245248
##   Poor_Wellbeing                   1627                     13126
rx.p1 <- df_final %>% 
  ggplot(aes(x = CVD, fill = wellbeing)) + 
  geom_bar() + 
  theme(legend.position = "none")

rx.p2 <- df_final %>% 
  ggplot(aes(x = CVD, fill = wellbeing)) + 
  geom_bar(position = "fill") + 
  ylab("proportion")

library(patchwork)
rx.p1 + rx.p2

There seems to be visual difference in the percentages of the Poor and Good Wellbeing for poeple with and without CVD

Inspecting the Relation Between LimitedActivity and Wellbeing

table(df_final$LimitedActivity, df_final$wellbeing)
##              
##               Good_Wellbeing Poor_Wellbeing
##   Limited              63161           9249
##   Not Limited         198435           5504
 df_final %>% 
  ggplot(aes(x = LimitedActivity , fill = wellbeing)) + 
  geom_bar() + 
  theme(legend.position = "none")

df_final %>% 
  ggplot(aes(x = LimitedActivity , fill = wellbeing)) + 
  geom_bar(position = "fill") + 
  ylab("proportion")

It could be confirmed visually that with Low Physical Activitys there is a significant increase in proportion of the Poor Wellbeing of individuals

Association between Wellbeing and Diabetes

table(df_final$wellbeing, df_final$Diabetes)
##                 
##                  Diabetic Non Diabetic
##   Good_Wellbeing    30091       231505
##   Poor_Wellbeing     2731        12022
rx.p1 <- df_final %>% 
  ggplot(aes(x = Diabetes, fill = wellbeing)) + 
  geom_bar() + 
  theme(legend.position = "none")

rx.p2 <- df_final %>% 
  ggplot(aes(x = Diabetes, fill = wellbeing)) + 
  geom_bar(position = "fill") + 
  ylab("proportion")

library(patchwork)
rx.p1 + rx.p2


There seems to be a visual difference in the percentages of the Poor and Good Wellbeing between Diabetic and Non Diabetic individuals.

Association between Wellbeing and Employment

table(df_final$wellbeing, df_final$employ)
##                 
##                  Homemaker Retired Self-Employed Student Unable to work
##   Good_Wellbeing     17917   72319         22360    3831          13376
##   Poor_Wellbeing       580    2412           807     230           4171
##                 
##                  Unemployed for less than 1 yr Unemployed for more than 1 yr
##   Good_Wellbeing                          6808                          7310
##   Poor_Wellbeing                          1073                          1536
##                 
##                  Waged Employement
##   Good_Wellbeing            117675
##   Poor_Wellbeing              3944
 df_final %>% 
  ggplot(aes(x = employ, fill = wellbeing)) + 
  geom_bar() + 
  theme(legend.position = "none")

df_final %>% 
  ggplot(aes(x = employ, fill = wellbeing)) + 
  geom_bar(position = "fill") + 
  ylab("proportion")


–> There seems to be minimal visual difference in the percentages of the Poor and Good Wellbeing for Homemaker, Retired and Waged Employment
–> There is relatively higher proportions of Poor Wellbeing in students than the prior-mentioned employment groups.
–> Individuals who are unable to work or are unemployed (either more than a year or less than a year) have significantly higher proportions of Poor Wellbeing than any other employment status groups.

Association between Wellbeing and BMI

table(df_final$wellbeing, df_final$BMI)
##                 
##                   High   Low Medium
##   Good_Wellbeing 74077 90142  97377
##   Poor_Wellbeing  5659  4552   4542
 df_final %>% 
  ggplot(aes(x = BMI, fill = wellbeing)) + 
  geom_bar() + 
  theme(legend.position = "none")

df_final %>% 
  ggplot(aes(x = BMI, fill = wellbeing)) + 
  geom_bar(position = "fill") + 
  ylab("proportion")


Individuals with higher BMI tend to have slightly higher proportions of “POOR” wellbeing.

Inspecting the Relation Between Poor Sleep Days and Wellbeing

table(df_final$PoorSleepDays, df_final$wellbeing)
##     
##      Good_Wellbeing Poor_Wellbeing
##   0           93450           2530
##   1            8071            164
##   2           21513            488
##   3           14940            418
##   4            9864            322
##   5           20947            734
##   6            3994            181
##   7            5835            311
##   8            2419            108
##   9             251             17
##   10          17947           1009
##   11             61              9
##   12           1630            115
##   13             88              8
##   14           1736            160
##   15          15834           1399
##   16            234             34
##   17            137             24
##   18            269             37
##   19             34              5
##   20          11968           1344
##   21            410             64
##   22            186             34
##   23             79             21
##   24            181             26
##   25           4416            633
##   26            240             36
##   27            294             40
##   28           1421            201
##   29            642             84
##   30          22505           4197
 df_final %>% 
  ggplot(aes(x = PoorSleepDays , fill = wellbeing)) + 
  geom_bar() + 
  theme(legend.position = "none")

df_final %>% 
  ggplot(aes(x = PoorSleepDays , fill = wellbeing)) + 
  geom_bar(position = "fill") + 
  ylab("proportion")

It could be seen visually that with increment in the Poor Sleep days there is a significant increase in the Poor Wellbeing of indivduals.

Inspecting the Relation Between PhysicalActivity and Wellbeing

table(df_final$PhysicalActivity, df_final$wellbeing)
##                    
##                     Good_Wellbeing Poor_Wellbeing
##   Adequate Activity         199787           8056
##   Low Activity               61809           6697
x1 = df_final %>% 
  ggplot(aes(x = PhysicalActivity , fill = wellbeing)) + 
  geom_bar() + 
  theme(legend.position = "none")


x2 =df_final %>% 
  ggplot(aes(x = PhysicalActivity , fill = wellbeing)) + 
  geom_bar(position = "fill") + 
  ylab("proportion")
library(patchwork)
x1+x2

It could be confirmed visually that with Low Physical Activitys there is a significant increase in proportion of the Poor Wellbeing of individuals

Association between Wellbeing and Race

table(df_final$wellbeing, df_final$race)
##                 
##                  Other races  White
##   Good_Wellbeing       46677 214919
##   Poor_Wellbeing        3268  11485
rx.p1 <- df_final %>% 
  ggplot(aes(x = race, fill = wellbeing)) + 
  geom_bar() + 
  theme(legend.position = "none")

rx.p2 <- df_final %>% 
  ggplot(aes(x = race, fill = wellbeing)) + 
  geom_bar(position = "fill") + 
  ylab("proportion")

library(patchwork)
rx.p1 + rx.p2

There seems to be minimal visual difference in the percentages of the Poor and Good Wellbeing for Heavy Drinker and not a Heavy Drinker.

Association between Wellbeing and Smoking Status

table(df_final$wellbeing, df_final$CurrentSmoker)
##                 
##                  Current Smoker Not a Current Smoker
##   Good_Wellbeing          38944               222652
##   Poor_Wellbeing           5253                 9500
rx.p1 <- df_final %>% 
  ggplot(aes(x = CurrentSmoker, fill = wellbeing)) + 
  geom_bar() + 
  theme(legend.position = "none")

rx.p2 <- df_final %>% 
  ggplot(aes(x = CurrentSmoker, fill = wellbeing)) + 
  geom_bar(position = "fill") + 
  ylab("proportion")

library(patchwork)
rx.p1 + rx.p2

From the Visual Evidence, it could be said that the Smoking Status affect the Wellbeing of an individual. It could interpreted from the graph that more number of current smokers possess poor wellbeing as compared to those who are not a current smoker.

Association between Wellbeing and Heavy Drinking Habits

table(df_final$wellbeing, df_final$HeavyDrinker)
##                 
##                  Heavy Drinker Not a Heavy Drinker
##   Good_Wellbeing         13189              248407
##   Poor_Wellbeing           962               13791
rx.p1 <- df_final %>% 
  ggplot(aes(x = HeavyDrinker, fill = wellbeing)) + 
  geom_bar() + 
  theme(legend.position = "none")

rx.p2 <- df_final %>% 
  ggplot(aes(x = HeavyDrinker, fill = wellbeing)) + 
  geom_bar(position = "fill") + 
  ylab("proportion")

library(patchwork)
rx.p1 + rx.p2

There seems to be minimal visual difference in the percentages of the Poor and Good Wellbeing for Heavy Drinker and not a Heavy Drinker.

Inspecting the Relation Between Poor Physical Health Days and Wellbeing

table(df_final$PoorPhysicalHealthDays, df_final$wellbeing)
##     
##      Good_Wellbeing Poor_Wellbeing
##   0          170204           4631
##   1           12093            417
##   2           15446            735
##   3            8583            544
##   4            4544            291
##   5            7908            552
##   6            1245            114
##   7            4443            351
##   8             779             61
##   9             142             22
##   10           5684            562
##   11             48              6
##   12            509             79
##   13             59              6
##   14           2465            220
##   15           4756            708
##   16            109             35
##   17             68             21
##   18            132             33
##   19             20              8
##   20           2968            591
##   21            574             70
##   22             67             13
##   23             44              9
##   24             52             17
##   25           1197            295
##   26             64              7
##   27             93             25
##   28            436            129
##   29            172             71
##   30          16692           4130
 df_final %>% 
  ggplot(aes(x = PoorPhysicalHealthDays , fill = wellbeing)) + 
  geom_bar() + 
  theme(legend.position = "none")

df_final %>% 
  ggplot(aes(x = PoorPhysicalHealthDays , fill = wellbeing)) + 
  geom_bar(position = "fill") + 
  ylab("proportion")

It could be seen visually that with increment in the Poor Physical Health days there is a significant increase in the Poor Wellbeing of indivduals

Inspecting the Relation Between Poor Mental Health Days and Wellbeing

table(df_final$PoorMentalHealthDays, df_final$wellbeing)
##     
##      Good_Wellbeing Poor_Wellbeing
##   0          183836           2912
##   1            9224            230
##   2           14293            550
##   3            7969            399
##   4            3897            278
##   5            9624            706
##   6             914            110
##   7            3276            345
##   8             702             78
##   9              88             19
##   10           6492            924
##   11             21              6
##   12            370             77
##   13             36             10
##   14           1191            197
##   15           5147           1155
##   16             61             16
##   17             45             17
##   18             62             28
##   19              9              3
##   20           2960            996
##   21            217             70
##   22             35             21
##   23             22              4
##   24             40             10
##   25            976            422
##   26             33             10
##   27             66             34
##   28            285            122
##   29            188             70
##   30           9517           4934
 df_final %>% 
  ggplot(aes(x = PoorMentalHealthDays , fill = wellbeing)) + 
  geom_bar() + 
  theme(legend.position = "none")

df_final %>% 
  ggplot(aes(x = PoorMentalHealthDays , fill = wellbeing)) + 
  geom_bar(position = "fill") + 
  ylab("proportion")

It could be seen visually that with increment in the Poor Mental Health days there is a significant increase in the Poor Wellbeing of indivduals.

Compare the increment of Poor Wellbeing in Poor Mental and Poor Physical Wellbeing

r1 = df_final %>% 
  ggplot(aes(x = PoorPhysicalHealthDays , fill = wellbeing)) + 
  geom_bar(position = "fill") + 
  ylab("proportion")

r2 = df_final %>% 
  ggplot(aes(x = PoorMentalHealthDays , fill = wellbeing)) + 
  geom_bar(position = "fill") + 
  ylab("proportion")

library(patchwork)
r1+r2

It is visually prominent that in both cases whenevr there is increase in the number of Poor Physical Health days or Poor Mental Health Days there seems to be almost linear increase in the Poor Wellbeing of the individuals

This also coduld be deduced from the graph that there is a clear association between Poor Health Days (Mental or Physical) and the Wellbeing of an indivdual.



The more interesting part to notice in here is that The increment in proportions of “Poor Wellbeing” in the Poor Mental Health days is quite more there is a significant increase in the Poor Wellbeing of indivduals.

Inspecting any presence of correlation between the Poor Physical and Poor Mental Health days

df_final %>% ggplot(aes(x=PoorPhysicalHealthDays,y=PoorMentalHealthDays,colour =wellbeing)) + geom_point() + geom_abline(intercept=30, slope=-1)

Visual Representation rejects my hypothesis that there is any correlation between the Poor Physical and Poor Mental heath days.



1. No presence of correlation between the Poor Mental and Poor Health Days
2. If the Graph is colored based on the Well being of an individual some thing very important starts to pop up :
We can visually bifurcate the presence of “Poor Wellbeing and Good Wellbeing” with a line y=-x where the upper half denotes More Poor Physical and More Poor Mental Health days where individauls have Poor Wellbeing
3. The sparse distribution of points in the upper half of the line signifies relatively lesser records of individuals who have higher poor mental or physical health days.

Inspecting any presence of correlation between the Some College Rate and Some High School Rate in the presence of Wellbeing

df_final %>% ggplot(aes(x=SomeCollegeRate,y=HighSchoolRate,colour = wellbeing)) + geom_point() + facet_grid(~wellbeing)

There is a clear presence of correlation between the two variables. That indicates that with increase in the College Rate there is parallel increment in School Rate specific to the county.

It is also evident that the distribution more or less remains the same for both wellbeings.

We could reject the possibility of interaction on wellbeing on High School and Some College Rates

Inspecting any presence of correlation between the PM2.5 and Population Density in the presence of Wellbeing

df_final %>% ggplot(aes(x=PM2.5,y=PopulationDensity,colour = wellbeing)) + geom_point() + facet_wrap(~wellbeing) +ylim(0,15000)
## Warning: Removed 926 rows containing missing values (`geom_point()`).

There is a clear presence of correlation between the two variables. That indicates with increase in the Population Density there is increment in the PM2.5 specific to the county.

It is also evident that the distribution more or less remains the same for both wellbeings.

We could reject the possibility of interaction on wellbeing on High School and Some College Rates

df_final %>% ggplot(aes(x=SomeCollegeRate,y=HighSchoolRate,colour = wellbeing)) + geom_point() + facet_grid(HeavyDrinker~wellbeing)

Correlation between Numeric Variables

# library(corrplot)
# library(sjPlot)
rr_county <- df_final[,c(2,22:33)]
rr_cm <- as.matrix(rr_county)

corr_m <- cor(rr_cm, use="pairwise.complete.obs")

Correlation Table

# correlation matrix table
tab_corr(rr_county,
         na.deletion="pairwise",
         corr.method="pearson",
         title="Correlations with pair-wise NA deletion",
         show.p=TRUE)
Correlations with pair-wise NA deletion
  age PrematureMortality HouseholdIncome ParkAccess CrimeRate UnemploymentRate WaterSafety HighSchoolRate SomeCollegeRate AccessToRecFacility FastFoodPercentage PM2.5 PopulationDensity
age   0.023*** -0.056*** -0.036*** -0.007*** 0.020*** 0.006** -0.013*** -0.038*** -0.003 -0.051*** -0.016*** -0.015***
PrematureMortality 0.023***   -0.681*** -0.308*** 0.398*** 0.336*** 0.103*** -0.402*** -0.609*** -0.445*** 0.276*** 0.208*** -0.043***
HouseholdIncome -0.056*** -0.681***   0.361*** -0.211*** -0.361*** -0.093*** 0.366*** 0.660*** 0.480*** -0.034*** -0.093*** 0.141***
ParkAccess -0.036*** -0.308*** 0.361***   0.290*** -0.109*** -0.137*** -0.115*** 0.486*** 0.220*** 0.176*** -0.130*** 0.373***
CrimeRate -0.007*** 0.398*** -0.211*** 0.290***   0.268*** -0.092*** -0.523*** -0.067*** -0.122*** 0.262*** 0.001 0.273***
UnemploymentRate 0.020*** 0.336*** -0.361*** -0.109*** 0.268***   -0.048*** -0.329*** -0.427*** -0.311*** 0.043*** -0.103*** 0.019***
WaterSafety 0.006** 0.103*** -0.093*** -0.137*** -0.092*** -0.048***   0.048*** -0.128*** -0.064*** 0.021*** 0.004* -0.071***
HighSchoolRate -0.013*** -0.402*** 0.366*** -0.115*** -0.523*** -0.329*** 0.048***   0.242*** 0.248*** -0.157*** -0.071*** -0.168***
SomeCollegeRate -0.038*** -0.609*** 0.660*** 0.486*** -0.067*** -0.427*** -0.128*** 0.242***   0.530*** -0.024*** 0.005* 0.200***
AccessToRecFacility -0.003 -0.445*** 0.480*** 0.220*** -0.122*** -0.311*** -0.064*** 0.248*** 0.530***   -0.262*** -0.009*** 0.183***
FastFoodPercentage -0.051*** 0.276*** -0.034*** 0.176*** 0.262*** 0.043*** 0.021*** -0.157*** -0.024*** -0.262***   0.145*** -0.012***
PM2.5 -0.016*** 0.208*** -0.093*** -0.130*** 0.001 -0.103*** 0.004* -0.071*** 0.005* -0.009*** 0.145***   -0.020***
PopulationDensity -0.015*** -0.043*** 0.141*** 0.373*** 0.273*** 0.019*** -0.071*** -0.168*** 0.200*** 0.183*** -0.012*** -0.020***  
Computed correlation used pearson-method with pairwise-deletion.

Correlation Visualization

# correlogram
testRes <- cor.mtest(rr_cm, conf.level = 0.95)
corrplot.mixed(corr_m,
               upper = "ellipse",
               p.mat = testRes$p,
               title="Correlation Matrix of Variables of Interest")

### Highly Correlated Groups

pairs.panels(df_final[, c(23:24,26,29:31)], scale = TRUE, ellipses = FALSE, pch = ".", lm = TRUE, rug = FALSE, stars = TRUE)

Association between Wellbeing and County Variables

# density plots

premort_wb_plot <- df_final %>% 
  ggdensity(  x = "PrematureMortality" ,
     add = "mean", rug = FALSE,
     color = "wellbeing", fill = "wellbeing", alpha=0.25)

hincome_wb_plot <- df_final %>% 
  ggdensity(  x = "HouseholdIncome" ,
     add = "mean", rug = FALSE,
     color = "wellbeing", fill = "wellbeing", alpha=0.25) + 
  theme(legend.position="none")

park_wb_plot <- df_final %>% 
  ggdensity(  x = "ParkAccess" ,
     add = "mean", rug = FALSE,
     color = "wellbeing", fill = "wellbeing", alpha=0.25) +
  theme(legend.position="none")

crime_wb_plot <- df_final %>% 
  ggdensity(  x = "CrimeRate" ,
     add = "mean", rug = FALSE,
     color = "wellbeing", fill = "wellbeing", alpha=0.25) +
  theme(legend.position="none")

unemp_wb_plot <- df_final %>% 
  ggdensity(  x = "UnemploymentRate" ,
     add = "mean", rug = FALSE,
     color = "wellbeing", fill = "wellbeing", alpha=0.25)

water_wb_plot <- df_final %>% 
  ggdensity(  x = "WaterSafety" ,
     add = "mean", rug = FALSE,
     color = "wellbeing", fill = "wellbeing", alpha=0.25) +
  theme(legend.position="none")

rec_wb_plot <- df_final %>% 
  ggdensity(  x = "AccessToRecFacility" ,
     add = "mean", rug = FALSE,
     color = "wellbeing", fill = "wellbeing", alpha=0.25) +
  theme(legend.position="none")

fastfood_wb_plot <- df_final %>% 
  ggdensity(  x = "FastFoodPercentage" ,
     add = "mean", rug = FALSE,
     color = "wellbeing", fill = "wellbeing", alpha=0.25) +
  theme(legend.position="none")

Premature Mortality Plots

# premature mortality
premort_wb_plot + ggtitle("Density plot of county premature mortality by wellbeing") + theme(legend.position="right") + theme_bw()

# county level factors won't changed based on wellbeing, which is individual
df_final %>%
  ggplot(aes(x=HouseholdIncome, y=PrematureMortality, group=wellbeing, color=wellbeing)) +
  geom_point(shape=1,alpha=0.5) +
  theme_bw() +
  stat_smooth(method="lm", formula=y~x, geom="smooth", se=TRUE) +
  ggtitle("Scatter Plot of County-Level Mean Household Income and\nNumber of Premature Deaths per 100,000") + 
  facet_grid(~wellbeing)

df_final %>%
  ggplot(aes(x=HouseholdIncome, y=PrematureMortality, group=wellbeing, color=wellbeing)) +
  geom_point(alpha=0) +
  theme_bw() +
  stat_smooth(method="lm", formula=y~x, geom="smooth", se=TRUE) +
  ggtitle("Trendlines of County-Level Mean Household Income and\nNumber of Premature Deaths per 100,000") 

Counties with higher mean household incomes tended to have lower premature mortality rates. This trend appears consistent across individuals who reported having good wellbeing and those that reported having poor wellbeing.

df_final %>%
  ggplot(aes(x=income, y=PrematureMortality, group=income, color=income)) +
  geom_jitter(shape=1,alpha=0.5) +
  theme_bw() +
  ggtitle("Jitter Plot of Individual Income Group and\nNumber of Premature Deaths per 100,000") + 
  facet_grid(~wellbeing)

df_final %>%
  ggplot(aes(x=income, y=PrematureMortality, group=income, color=income)) +
  geom_boxplot() +
  theme_bw() +
  ggtitle("Box Plot of Individual Income Group and\nNumber of Premature Deaths per 100,000") + 
  facet_grid(~wellbeing)

Household Income Plots

hincome_wb_plot + ggtitle("Density plot of county household income by wellbeing") + theme(legend.position="right") + theme_bw()

df_final %>%
  ggplot(aes(x=income, y=HouseholdIncome, group=income, color=income)) +
  geom_jitter(shape=1,alpha=0.5) +
  theme_bw() +
  ggtitle("Jitter Plot of County-Level Mean Household Income and\nIndividual Income Group") + 
  facet_grid(~wellbeing)

df_final %>%
  ggplot(aes(x=income, y=HouseholdIncome, group=income, color=income)) +
  geom_boxplot() +
  theme_bw() +
  ggtitle("Box Plot of County-Level Mean Household Income and\nInividual Income Group") + 
  facet_grid(~wellbeing)

Among those with good wellbeing, there appear to be more indviduals living in counties with high household incomes.

Bin HouseholdIncome

# bin mean household income to match individual income bins
df_final <- df_final %>%
  mutate(HouseholdIncome.bin = cut(HouseholdIncome, breaks=c(0, 24999, 34999, 49999, 119525)))

df_final$HouseholdIncome.bin <- recode_factor(df_final$HouseholdIncome.bin, "(0,2.5e+04]" = "less than 25000", "(2.5e+04,3.5e+04]" = "[25000,35000)", "(3.5e+04,5e+04]" = "[35000,50000)","(5e+04,1.2e+05]" = "50000 or more")
frq(df_final$HouseholdIncome.bin)
## x <categorical> 
## # total N=276349 valid N=276349 mean=3.40 sd=0.61
## 
## Value           |      N | Raw % | Valid % | Cum. %
## ---------------------------------------------------
## less than 25000 |    215 |  0.08 |    0.08 |   0.08
## [25000,35000)   |  17342 |  6.28 |    6.28 |   6.35
## [35000,50000)   | 131804 | 47.69 |   47.69 |  54.05
## 50000 or more   | 126988 | 45.95 |   45.95 | 100.00
## <NA>            |      0 |  0.00 |    <NA> |   <NA>
# create equal sized bins
library(funModeling)
df_final$HouseholdIncome.bin.equal <- equal_freq(var=df_final$HouseholdIncome, n_bins=5)
frq(df_final$HouseholdIncome.bin.equal)
## x <categorical> 
## # total N=276349 valid N=276349 mean=2.99 sd=1.41
## 
## Value          |     N | Raw % | Valid % | Cum. %
## -------------------------------------------------
## [23751, 40704) | 55598 | 20.12 |   20.12 |  20.12
## [40704, 46233) | 55049 | 19.92 |   19.92 |  40.04
## [46233, 51106) | 55648 | 20.14 |   20.14 |  60.18
## [51106, 60397) | 55294 | 20.01 |   20.01 |  80.18
## [60397,119525] | 54760 | 19.82 |   19.82 | 100.00
## <NA>           |     0 |  0.00 |    <NA> |   <NA>
tab_xtab(df_final$HouseholdIncome.bin, df_final$wellbeing, show.col.prc = TRUE, show.exp = TRUE)  
HouseholdIncome.bin wellbeing Total
Good_Wellbeing Poor_Wellbeing
less than 25000 204
204
0.1 %
11
11
0.1 %
215
215
0.1 %
[25000,35000) 16218
16416
6.2 %
1124
926
7.6 %
17342
17342
6.3 %
[35000,50000) 124246
124768
47.5 %
7558
7036
51.2 %
131804
131804
47.7 %
50000 or more 120928
120209
46.2 %
6060
6779
41.1 %
126988
126988
46 %
Total 261596
261596
100 %
14753
14753
100 %
276349
276349
100 %
χ2=166.310 · df=3 · Cramer’s V=0.025 · p=0.000
# compare againt individual income
df_final %>% 
  group_by(HouseholdIncome.bin) %>%
  ggplot(aes(x = income, fill = wellbeing)) + 
  geom_bar(position = "fill") + 
  ylab("proportion") +
  facet_wrap(~HouseholdIncome.bin) +
  theme(axis.text.x = element_text(angle = 45, vjust=1, hjust=1))

Park Access Plots

park_wb_plot + ggtitle("Density plot of county park access by wellbeing") + theme(legend.position="right") + theme_bw()

Crime Rate Plots

crime_wb_plot + ggtitle("Density plot of county crime rate by wellbeing") + theme(legend.position="right") + theme_bw()

Unemployment Rate Plots

unemp_wb_plot + ggtitle("Density plot of county unemployment rate by wellbeing") + theme(legend.position="right") + theme_bw()

Water Safety Plots

water_wb_plot + ggtitle("Density plot of Water Safety Rate by wellbeing") + theme(legend.position="right") + theme_bw()

waterlog_wb_plot <- df.omit %>% 
  mutate(WaterSafetyLog = log(WaterSafety,10)) %>%
  filter(WaterSafetyLog >= 0) %>%
  ggdensity(  x = "WaterSafetyLog" ,
     color = "wellbeing", fill = "wellbeing", alpha=0.25) +
  theme(legend.position="right")

waterlog_wb_plot

Access to Recreational Facilities Plots

rec_wb_plot + ggtitle("Density plot of county recreational facility access by wellbeing") + theme(legend.position="right") + theme_bw()

Fast Food Percentage Plots

fastfood_wb_plot + ggtitle("Density plot of county fast food restaurant percentage\nby wellbeing") + theme(legend.position="right") + theme_bw()

By BMI
# BMI
df.omit %>%
  ggplot(aes(x=FastFoodPercentage, group=BMI, fill=BMI)) +
  geom_density() +
  facet_wrap(~BMI) +
  theme_bw()

By CVD
# BMI
df_final %>%
  ggplot(aes(x=FastFoodPercentage, group=CVD, fill=CVD)) +
  geom_density() +
  facet_wrap(~CVD) +
  theme_bw()

By General Health
# BMI
df_final %>%
  ggplot(aes(x=FastFoodPercentage, group=GeneralHealth, fill=GeneralHealth)) +
  geom_density() +
  facet_wrap(~GeneralHealth) +
  theme_bw()

By Diabetes
# BMI
df_final %>%
  ggplot(aes(x=FastFoodPercentage, group=Diabetes, fill=Diabetes)) +
  geom_density() +
  facet_wrap(~Diabetes) +
  theme_bw()

All Plots

multiplot(premort_wb_plot, hincome_wb_plot, park_wb_plot, crime_wb_plot, unemp_wb_plot, water_wb_plot, rec_wb_plot, fastfood_wb_plot, plotlist=NULL, cols=2)

Logistic Regression Model (Statistical Methods)

All Predictors

# set up variables 
dependent <- "wellbeing"
pred.all <- c("age","SocialSupport","race","income","GeneralHealth","PoorPhysicalHealthDays","PoorMentalHealthDays","Asthma","HealthInsurance","CVD","LimitedActivity","Diabetes","employ","BMI","HeavyDrinker","CurrentSmoker","PoorSleepDays","marital","PrematureMortality","HouseholdIncome","ParkAccess","CrimeRate","UnemploymentRate","WaterSafety","HighSchoolRate","SomeCollegeRate","AccessToRecFacility","FastFoodPercentage","PM2.5","PopulationDensity")
# initial model
model.all <- glm(wellbeing ~ 
                 age+
                 SocialSupport+
                 race+
                 income+
                 GeneralHealth+
                 PoorPhysicalHealthDays+
                 PoorMentalHealthDays+
                 Asthma+
                 HealthInsurance+
                 CVD+
                 LimitedActivity+
                 Diabetes+
                 employ+
                BMI+
                  HeavyDrinker+
                  CurrentSmoker+
                  PoorSleepDays+
                  marital+
                  PrematureMortality+
                  HouseholdIncome+
                  ParkAccess+
                  CrimeRate+
                  UnemploymentRate+
                  WaterSafety+
                  HighSchoolRate+
                  SomeCollegeRate+
                  AccessToRecFacility+
                  FastFoodPercentage+
                  PM2.5+
                  PopulationDensity,
               family=binomial(logit),
               data=df_final)

summary(model.all)
## 
## Call:
## glm(formula = wellbeing ~ age + SocialSupport + race + income + 
##     GeneralHealth + PoorPhysicalHealthDays + PoorMentalHealthDays + 
##     Asthma + HealthInsurance + CVD + LimitedActivity + Diabetes + 
##     employ + BMI + HeavyDrinker + CurrentSmoker + PoorSleepDays + 
##     marital + PrematureMortality + HouseholdIncome + ParkAccess + 
##     CrimeRate + UnemploymentRate + WaterSafety + HighSchoolRate + 
##     SomeCollegeRate + AccessToRecFacility + FastFoodPercentage + 
##     PM2.5 + PopulationDensity, family = binomial(logit), data = df_final)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.4901  -0.2319  -0.1418  -0.1017   3.6214  
## 
## Coefficients:
##                                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                         -1.790e+00  2.297e-01  -7.796 6.39e-15 ***
## age                                 -3.389e-03  8.947e-04  -3.788 0.000152 ***
## SocialSupport Sometimes             -8.370e-01  3.213e-02 -26.048  < 2e-16 ***
## SocialSupportAlways                 -2.674e+00  3.617e-02 -73.949  < 2e-16 ***
## SocialSupportNever                  -7.537e-01  4.073e-02 -18.506  < 2e-16 ***
## SocialSupportUsually                -2.027e+00  3.450e-02 -58.765  < 2e-16 ***
## raceWhite                            3.322e-01  2.681e-02  12.392  < 2e-16 ***
## income[25000-35000)                 -5.240e-02  3.504e-02  -1.495 0.134815    
## income[35000-50000)                 -1.079e-01  3.572e-02  -3.020 0.002528 ** 
## income50000 or more                 -3.257e-01  3.407e-02  -9.560  < 2e-16 ***
## incomeless than 15000                2.193e-02  2.920e-02   0.751 0.452611    
## GeneralHealthFair                    6.709e-01  4.796e-02  13.989  < 2e-16 ***
## GeneralHealthGood                    3.648e-01  4.392e-02   8.305  < 2e-16 ***
## GeneralHealthPoor                    1.122e+00  5.505e-02  20.390  < 2e-16 ***
## GeneralHealthVery good               1.954e-01  4.478e-02   4.363 1.28e-05 ***
## PoorPhysicalHealthDays               3.232e-03  1.189e-03   2.718 0.006569 ** 
## PoorMentalHealthDays                 6.163e-02  9.233e-04  66.743  < 2e-16 ***
## AsthmaYes                           -3.949e-02  2.908e-02  -1.358 0.174478    
## HealthInsuranceWith Health Coverage -2.436e-01  2.889e-02  -8.433  < 2e-16 ***
## CVDNo Cardiovascular Disease         4.168e-02  3.589e-02   1.161 0.245546    
## LimitedActivityNot Limited          -4.983e-01  2.497e-02 -19.958  < 2e-16 ***
## DiabetesNon Diabetic                 2.359e-03  2.895e-02   0.081 0.935073    
## employRetired                       -2.884e-02  5.518e-02  -0.523 0.601184    
## employSelf-Employed                  2.616e-01  6.246e-02   4.189 2.80e-05 ***
## employStudent                        2.342e-01  9.294e-02   2.520 0.011739 *  
## employUnable to work                 4.569e-01  5.459e-02   8.370  < 2e-16 ***
## employUnemployed for less than 1 yr  9.516e-01  6.276e-02  15.162  < 2e-16 ***
## employUnemployed for more than 1 yr  9.374e-01  5.950e-02  15.755  < 2e-16 ***
## employWaged Employement              1.731e-01  5.161e-02   3.353 0.000799 ***
## BMILow                               1.101e-03  2.614e-02   0.042 0.966403    
## BMIMedium                           -6.881e-02  2.516e-02  -2.735 0.006234 ** 
## HeavyDrinkerNot a Heavy Drinker     -2.842e-01  4.217e-02  -6.739 1.59e-11 ***
## CurrentSmokerNot a Current Smoker   -2.404e-01  2.361e-02 -10.181  < 2e-16 ***
## PoorSleepDays                        1.708e-02  9.519e-04  17.941  < 2e-16 ***
## marital1                            -4.774e-01  2.349e-02 -20.323  < 2e-16 ***
## PrematureMortality                  -5.040e-04  2.040e-04  -2.470 0.013494 *  
## HouseholdIncome                      2.974e-07  1.322e-06   0.225 0.821984    
## ParkAccess                           2.059e-03  5.702e-04   3.612 0.000304 ***
## CrimeRate                            1.037e-04  5.089e-05   2.037 0.041667 *  
## UnemploymentRate                     7.035e-03  4.673e-03   1.506 0.132176    
## WaterSafety                         -2.527e-03  7.138e-04  -3.540 0.000401 ***
## HighSchoolRate                       5.128e-05  1.416e-03   0.036 0.971120    
## SomeCollegeRate                      7.678e-03  1.534e-03   5.005 5.57e-07 ***
## AccessToRecFacility                 -3.301e-03  2.659e-03  -1.241 0.214501    
## FastFoodPercentage                  -1.510e-03  1.404e-03  -1.076 0.282062    
## PM2.5                                4.498e-03  7.338e-03   0.613 0.539909    
## PopulationDensity                    4.673e-06  2.556e-06   1.828 0.067493 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 115163  on 276348  degrees of freedom
## Residual deviance:  73838  on 276302  degrees of freedom
## AIC: 73932
## 
## Number of Fisher Scoring iterations: 7

Multicollinearity

Correlations for all numeric predictors

# library(corrplot)
# library(sjPlot)
rr_county <- df_final[,c(2,22:33)]
rr_cm <- as.matrix(rr_county)

corr_m <- cor(rr_cm, use="pairwise.complete.obs")
Correlation Table
# correlation matrix table
tab_corr(rr_county,
         na.deletion="pairwise",
         corr.method="pearson",
         title="Correlations with pair-wise NA deletion",
         show.p=TRUE)
Correlations with pair-wise NA deletion
  age PrematureMortality HouseholdIncome ParkAccess CrimeRate UnemploymentRate WaterSafety HighSchoolRate SomeCollegeRate AccessToRecFacility FastFoodPercentage PM2.5 PopulationDensity
age   0.023*** -0.056*** -0.036*** -0.007*** 0.020*** 0.006** -0.013*** -0.038*** -0.003 -0.051*** -0.016*** -0.015***
PrematureMortality 0.023***   -0.681*** -0.308*** 0.398*** 0.336*** 0.103*** -0.402*** -0.609*** -0.445*** 0.276*** 0.208*** -0.043***
HouseholdIncome -0.056*** -0.681***   0.361*** -0.211*** -0.361*** -0.093*** 0.366*** 0.660*** 0.480*** -0.034*** -0.093*** 0.141***
ParkAccess -0.036*** -0.308*** 0.361***   0.290*** -0.109*** -0.137*** -0.115*** 0.486*** 0.220*** 0.176*** -0.130*** 0.373***
CrimeRate -0.007*** 0.398*** -0.211*** 0.290***   0.268*** -0.092*** -0.523*** -0.067*** -0.122*** 0.262*** 0.001 0.273***
UnemploymentRate 0.020*** 0.336*** -0.361*** -0.109*** 0.268***   -0.048*** -0.329*** -0.427*** -0.311*** 0.043*** -0.103*** 0.019***
WaterSafety 0.006** 0.103*** -0.093*** -0.137*** -0.092*** -0.048***   0.048*** -0.128*** -0.064*** 0.021*** 0.004* -0.071***
HighSchoolRate -0.013*** -0.402*** 0.366*** -0.115*** -0.523*** -0.329*** 0.048***   0.242*** 0.248*** -0.157*** -0.071*** -0.168***
SomeCollegeRate -0.038*** -0.609*** 0.660*** 0.486*** -0.067*** -0.427*** -0.128*** 0.242***   0.530*** -0.024*** 0.005* 0.200***
AccessToRecFacility -0.003 -0.445*** 0.480*** 0.220*** -0.122*** -0.311*** -0.064*** 0.248*** 0.530***   -0.262*** -0.009*** 0.183***
FastFoodPercentage -0.051*** 0.276*** -0.034*** 0.176*** 0.262*** 0.043*** 0.021*** -0.157*** -0.024*** -0.262***   0.145*** -0.012***
PM2.5 -0.016*** 0.208*** -0.093*** -0.130*** 0.001 -0.103*** 0.004* -0.071*** 0.005* -0.009*** 0.145***   -0.020***
PopulationDensity -0.015*** -0.043*** 0.141*** 0.373*** 0.273*** 0.019*** -0.071*** -0.168*** 0.200*** 0.183*** -0.012*** -0.020***  
Computed correlation used pearson-method with pairwise-deletion.
Correlation Visualization
# correlogram
testRes <- cor.mtest(rr_cm, conf.level = 0.95)
corrplot.mixed(corr_m,
               upper = "ellipse",
               p.mat = testRes$p,
               title="Correlation Matrix of Variables of Interest")

##### Highly Correlated Groups

pairs.panels(df_final[, c(23:24,26,29:31)], scale = TRUE, ellipses = FALSE, pch = ".", lm = TRUE, rug = FALSE, stars = TRUE)

VIF for all Predictors Model

library(car)
library(finalfit)
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:patchwork':
## 
##     area
## The following object is masked from 'package:dplyr':
## 
##     select
vif_all <- vif(model.all)
vif_all <- as.data.frame(vif_all)
#create horizontal bar chart to display each VIF value
vif_plot_all <- vif_all %>%
  ggplot(aes(x=rownames(vif_all), y=GVIF)) +
  geom_col(color="#434343", fill="#6BD6DB") +
  xlab("All Predictors") +
  ylab("VIF Values") +
  ggtitle("VIF Values for All Predictors") +
  geom_hline(aes(yintercept=4),color="#434343", linetype="dashed", linewidth=0.5) +
  coord_flip()

vif_plot_all

Stepwise Regression

Stepwise function

step <- stepAIC(model.all, direction="both") # Stepwise function
## Start:  AIC=73931.82
## wellbeing ~ age + SocialSupport + race + income + GeneralHealth + 
##     PoorPhysicalHealthDays + PoorMentalHealthDays + Asthma + 
##     HealthInsurance + CVD + LimitedActivity + Diabetes + employ + 
##     BMI + HeavyDrinker + CurrentSmoker + PoorSleepDays + marital + 
##     PrematureMortality + HouseholdIncome + ParkAccess + CrimeRate + 
##     UnemploymentRate + WaterSafety + HighSchoolRate + SomeCollegeRate + 
##     AccessToRecFacility + FastFoodPercentage + PM2.5 + PopulationDensity
## 
##                          Df Deviance   AIC
## - HighSchoolRate          1    73838 73930
## - Diabetes                1    73838 73930
## - HouseholdIncome         1    73838 73930
## - PM2.5                   1    73838 73930
## - FastFoodPercentage      1    73839 73931
## - CVD                     1    73839 73931
## - AccessToRecFacility     1    73839 73931
## - Asthma                  1    73840 73932
## <none>                         73838 73932
## - UnemploymentRate        1    73840 73932
## - PopulationDensity       1    73841 73933
## - CrimeRate               1    73842 73934
## - PrematureMortality      1    73844 73936
## - PoorPhysicalHealthDays  1    73845 73937
## - BMI                     2    73848 73938
## - WaterSafety             1    73851 73943
## - ParkAccess              1    73851 73943
## - age                     1    73852 73944
## - SomeCollegeRate         1    73863 73955
## - HeavyDrinker            1    73881 73973
## - HealthInsurance         1    73908 74000
## - CurrentSmoker           1    73940 74032
## - income                  4    73948 74034
## - race                    1    73995 74087
## - PoorSleepDays           1    74154 74246
## - LimitedActivity         1    74231 74323
## - marital                 1    74256 74348
## - GeneralHealth           4    74386 74472
## - employ                  7    74564 74644
## - PoorMentalHealthDays    1    78138 78230
## - SocialSupport           4    81876 81962
## 
## Step:  AIC=73929.82
## wellbeing ~ age + SocialSupport + race + income + GeneralHealth + 
##     PoorPhysicalHealthDays + PoorMentalHealthDays + Asthma + 
##     HealthInsurance + CVD + LimitedActivity + Diabetes + employ + 
##     BMI + HeavyDrinker + CurrentSmoker + PoorSleepDays + marital + 
##     PrematureMortality + HouseholdIncome + ParkAccess + CrimeRate + 
##     UnemploymentRate + WaterSafety + SomeCollegeRate + AccessToRecFacility + 
##     FastFoodPercentage + PM2.5 + PopulationDensity
## 
##                          Df Deviance   AIC
## - Diabetes                1    73838 73928
## - HouseholdIncome         1    73838 73928
## - PM2.5                   1    73838 73928
## - FastFoodPercentage      1    73839 73929
## - CVD                     1    73839 73929
## - AccessToRecFacility     1    73839 73929
## - Asthma                  1    73840 73930
## <none>                         73838 73930
## - UnemploymentRate        1    73840 73930
## - PopulationDensity       1    73841 73931
## + HighSchoolRate          1    73838 73932
## - CrimeRate               1    73842 73932
## - PrematureMortality      1    73844 73934
## - PoorPhysicalHealthDays  1    73845 73935
## - BMI                     2    73848 73936
## - WaterSafety             1    73851 73941
## - ParkAccess              1    73851 73941
## - age                     1    73852 73942
## - SomeCollegeRate         1    73863 73953
## - HeavyDrinker            1    73881 73971
## - HealthInsurance         1    73908 73998
## - CurrentSmoker           1    73940 74030
## - income                  4    73948 74032
## - race                    1    73995 74085
## - PoorSleepDays           1    74154 74244
## - LimitedActivity         1    74232 74322
## - marital                 1    74256 74346
## - GeneralHealth           4    74386 74470
## - employ                  7    74564 74642
## - PoorMentalHealthDays    1    78138 78228
## - SocialSupport           4    81876 81960
## 
## Step:  AIC=73927.83
## wellbeing ~ age + SocialSupport + race + income + GeneralHealth + 
##     PoorPhysicalHealthDays + PoorMentalHealthDays + Asthma + 
##     HealthInsurance + CVD + LimitedActivity + employ + BMI + 
##     HeavyDrinker + CurrentSmoker + PoorSleepDays + marital + 
##     PrematureMortality + HouseholdIncome + ParkAccess + CrimeRate + 
##     UnemploymentRate + WaterSafety + SomeCollegeRate + AccessToRecFacility + 
##     FastFoodPercentage + PM2.5 + PopulationDensity
## 
##                          Df Deviance   AIC
## - HouseholdIncome         1    73838 73926
## - PM2.5                   1    73838 73926
## - FastFoodPercentage      1    73839 73927
## - CVD                     1    73839 73927
## - AccessToRecFacility     1    73839 73927
## - Asthma                  1    73840 73928
## <none>                         73838 73928
## - UnemploymentRate        1    73840 73928
## - PopulationDensity       1    73841 73929
## + Diabetes                1    73838 73930
## + HighSchoolRate          1    73838 73930
## - CrimeRate               1    73842 73930
## - PrematureMortality      1    73844 73932
## - PoorPhysicalHealthDays  1    73845 73933
## - BMI                     2    73848 73934
## - WaterSafety             1    73851 73939
## - ParkAccess              1    73851 73939
## - age                     1    73852 73940
## - SomeCollegeRate         1    73863 73951
## - HeavyDrinker            1    73881 73969
## - HealthInsurance         1    73908 73996
## - CurrentSmoker           1    73940 74028
## - income                  4    73948 74030
## - race                    1    73996 74084
## - PoorSleepDays           1    74154 74242
## - LimitedActivity         1    74232 74320
## - marital                 1    74256 74344
## - GeneralHealth           4    74394 74476
## - employ                  7    74564 74640
## - PoorMentalHealthDays    1    78139 78227
## - SocialSupport           4    81876 81958
## 
## Step:  AIC=73925.88
## wellbeing ~ age + SocialSupport + race + income + GeneralHealth + 
##     PoorPhysicalHealthDays + PoorMentalHealthDays + Asthma + 
##     HealthInsurance + CVD + LimitedActivity + employ + BMI + 
##     HeavyDrinker + CurrentSmoker + PoorSleepDays + marital + 
##     PrematureMortality + ParkAccess + CrimeRate + UnemploymentRate + 
##     WaterSafety + SomeCollegeRate + AccessToRecFacility + FastFoodPercentage + 
##     PM2.5 + PopulationDensity
## 
##                          Df Deviance   AIC
## - PM2.5                   1    73838 73924
## - FastFoodPercentage      1    73839 73925
## - CVD                     1    73839 73925
## - AccessToRecFacility     1    73839 73925
## - Asthma                  1    73840 73926
## <none>                         73838 73926
## - UnemploymentRate        1    73840 73926
## - PopulationDensity       1    73841 73927
## + HouseholdIncome         1    73838 73928
## + Diabetes                1    73838 73928
## + HighSchoolRate          1    73838 73928
## - CrimeRate               1    73842 73928
## - PoorPhysicalHealthDays  1    73845 73931
## - PrematureMortality      1    73846 73932
## - BMI                     2    73848 73932
## - WaterSafety             1    73851 73937
## - ParkAccess              1    73851 73937
## - age                     1    73852 73938
## - SomeCollegeRate         1    73866 73952
## - HeavyDrinker            1    73881 73967
## - HealthInsurance         1    73908 73994
## - CurrentSmoker           1    73940 74026
## - income                  4    73949 74029
## - race                    1    73996 74082
## - PoorSleepDays           1    74154 74240
## - LimitedActivity         1    74232 74318
## - marital                 1    74257 74343
## - GeneralHealth           4    74394 74474
## - employ                  7    74565 74639
## - PoorMentalHealthDays    1    78140 78226
## - SocialSupport           4    81877 81957
## 
## Step:  AIC=73924.25
## wellbeing ~ age + SocialSupport + race + income + GeneralHealth + 
##     PoorPhysicalHealthDays + PoorMentalHealthDays + Asthma + 
##     HealthInsurance + CVD + LimitedActivity + employ + BMI + 
##     HeavyDrinker + CurrentSmoker + PoorSleepDays + marital + 
##     PrematureMortality + ParkAccess + CrimeRate + UnemploymentRate + 
##     WaterSafety + SomeCollegeRate + AccessToRecFacility + FastFoodPercentage + 
##     PopulationDensity
## 
##                          Df Deviance   AIC
## - FastFoodPercentage      1    73839 73923
## - CVD                     1    73840 73924
## - AccessToRecFacility     1    73840 73924
## - Asthma                  1    73840 73924
## <none>                         73838 73924
## - UnemploymentRate        1    73840 73924
## - PopulationDensity       1    73842 73926
## + PM2.5                   1    73838 73926
## + HouseholdIncome         1    73838 73926
## + Diabetes                1    73838 73926
## + HighSchoolRate          1    73838 73926
## - CrimeRate               1    73843 73927
## - PoorPhysicalHealthDays  1    73846 73930
## - PrematureMortality      1    73846 73930
## - BMI                     2    73848 73930
## - ParkAccess              1    73851 73935
## - WaterSafety             1    73851 73935
## - age                     1    73853 73937
## - SomeCollegeRate         1    73867 73951
## - HeavyDrinker            1    73882 73966
## - HealthInsurance         1    73908 73992
## - CurrentSmoker           1    73941 74025
## - income                  4    73949 74027
## - race                    1    73998 74082
## - PoorSleepDays           1    74155 74239
## - LimitedActivity         1    74232 74316
## - marital                 1    74258 74342
## - GeneralHealth           4    74394 74472
## - employ                  7    74565 74637
## - PoorMentalHealthDays    1    78140 78224
## - SocialSupport           4    81877 81955
## 
## Step:  AIC=73923.22
## wellbeing ~ age + SocialSupport + race + income + GeneralHealth + 
##     PoorPhysicalHealthDays + PoorMentalHealthDays + Asthma + 
##     HealthInsurance + CVD + LimitedActivity + employ + BMI + 
##     HeavyDrinker + CurrentSmoker + PoorSleepDays + marital + 
##     PrematureMortality + ParkAccess + CrimeRate + UnemploymentRate + 
##     WaterSafety + SomeCollegeRate + AccessToRecFacility + PopulationDensity
## 
##                          Df Deviance   AIC
## - AccessToRecFacility     1    73840 73922
## - CVD                     1    73841 73923
## - Asthma                  1    73841 73923
## <none>                         73839 73923
## - UnemploymentRate        1    73841 73923
## + FastFoodPercentage      1    73838 73924
## + PM2.5                   1    73839 73925
## - PopulationDensity       1    73843 73925
## + Diabetes                1    73839 73925
## + HouseholdIncome         1    73839 73925
## + HighSchoolRate          1    73839 73925
## - CrimeRate               1    73843 73925
## - PoorPhysicalHealthDays  1    73847 73929
## - BMI                     2    73849 73929
## - PrematureMortality      1    73849 73931
## - ParkAccess              1    73851 73933
## - WaterSafety             1    73852 73934
## - age                     1    73854 73936
## - SomeCollegeRate         1    73868 73950
## - HeavyDrinker            1    73883 73965
## - HealthInsurance         1    73909 73991
## - CurrentSmoker           1    73942 74024
## - income                  4    73951 74027
## - race                    1    74000 74082
## - PoorSleepDays           1    74155 74237
## - LimitedActivity         1    74233 74315
## - marital                 1    74259 74341
## - GeneralHealth           4    74395 74471
## - employ                  7    74567 74637
## - PoorMentalHealthDays    1    78141 78223
## - SocialSupport           4    81879 81955
## 
## Step:  AIC=73922.24
## wellbeing ~ age + SocialSupport + race + income + GeneralHealth + 
##     PoorPhysicalHealthDays + PoorMentalHealthDays + Asthma + 
##     HealthInsurance + CVD + LimitedActivity + employ + BMI + 
##     HeavyDrinker + CurrentSmoker + PoorSleepDays + marital + 
##     PrematureMortality + ParkAccess + CrimeRate + UnemploymentRate + 
##     WaterSafety + SomeCollegeRate + PopulationDensity
## 
##                          Df Deviance   AIC
## - CVD                     1    73842 73922
## - Asthma                  1    73842 73922
## <none>                         73840 73922
## - UnemploymentRate        1    73843 73923
## + AccessToRecFacility     1    73839 73923
## - PopulationDensity       1    73843 73923
## + FastFoodPercentage      1    73840 73924
## + PM2.5                   1    73840 73924
## + HouseholdIncome         1    73840 73924
## + HighSchoolRate          1    73840 73924
## + Diabetes                1    73840 73924
## - CrimeRate               1    73844 73924
## - PoorPhysicalHealthDays  1    73848 73928
## - BMI                     2    73850 73928
## - PrematureMortality      1    73849 73929
## - ParkAccess              1    73853 73933
## - WaterSafety             1    73853 73933
## - age                     1    73855 73935
## - SomeCollegeRate         1    73868 73948
## - HeavyDrinker            1    73884 73964
## - HealthInsurance         1    73910 73990
## - CurrentSmoker           1    73943 74023
## - income                  4    73952 74026
## - race                    1    74001 74081
## - PoorSleepDays           1    74156 74236
## - LimitedActivity         1    74235 74315
## - marital                 1    74260 74340
## - GeneralHealth           4    74396 74470
## - employ                  7    74567 74635
## - PoorMentalHealthDays    1    78141 78221
## - SocialSupport           4    81880 81954
## 
## Step:  AIC=73921.59
## wellbeing ~ age + SocialSupport + race + income + GeneralHealth + 
##     PoorPhysicalHealthDays + PoorMentalHealthDays + Asthma + 
##     HealthInsurance + LimitedActivity + employ + BMI + HeavyDrinker + 
##     CurrentSmoker + PoorSleepDays + marital + PrematureMortality + 
##     ParkAccess + CrimeRate + UnemploymentRate + WaterSafety + 
##     SomeCollegeRate + PopulationDensity
## 
##                          Df Deviance   AIC
## - Asthma                  1    73844 73922
## <none>                         73842 73922
## - UnemploymentRate        1    73844 73922
## + CVD                     1    73840 73922
## + AccessToRecFacility     1    73841 73923
## - PopulationDensity       1    73845 73923
## + FastFoodPercentage      1    73841 73923
## + PM2.5                   1    73841 73923
## + Diabetes                1    73842 73924
## + HouseholdIncome         1    73842 73924
## + HighSchoolRate          1    73842 73924
## - CrimeRate               1    73846 73924
## - PoorPhysicalHealthDays  1    73849 73927
## - BMI                     2    73852 73928
## - PrematureMortality      1    73850 73928
## - ParkAccess              1    73854 73932
## - WaterSafety             1    73855 73933
## - age                     1    73857 73935
## - SomeCollegeRate         1    73870 73948
## - HeavyDrinker            1    73885 73963
## - HealthInsurance         1    73912 73990
## - CurrentSmoker           1    73944 74022
## - income                  4    73953 74025
## - race                    1    74002 74080
## - PoorSleepDays           1    74157 74235
## - LimitedActivity         1    74235 74313
## - marital                 1    74262 74340
## - GeneralHealth           4    74400 74472
## - employ                  7    74569 74635
## - PoorMentalHealthDays    1    78145 78223
## - SocialSupport           4    81882 81954
## 
## Step:  AIC=73921.58
## wellbeing ~ age + SocialSupport + race + income + GeneralHealth + 
##     PoorPhysicalHealthDays + PoorMentalHealthDays + HealthInsurance + 
##     LimitedActivity + employ + BMI + HeavyDrinker + CurrentSmoker + 
##     PoorSleepDays + marital + PrematureMortality + ParkAccess + 
##     CrimeRate + UnemploymentRate + WaterSafety + SomeCollegeRate + 
##     PopulationDensity
## 
##                          Df Deviance   AIC
## <none>                         73844 73922
## + Asthma                  1    73842 73922
## + CVD                     1    73842 73922
## - UnemploymentRate        1    73846 73922
## + AccessToRecFacility     1    73843 73923
## - PopulationDensity       1    73847 73923
## + FastFoodPercentage      1    73843 73923
## + PM2.5                   1    73843 73923
## + Diabetes                1    73844 73924
## + HouseholdIncome         1    73844 73924
## + HighSchoolRate          1    73844 73924
## - CrimeRate               1    73848 73924
## - PoorPhysicalHealthDays  1    73851 73927
## - BMI                     2    73854 73928
## - PrematureMortality      1    73852 73928
## - ParkAccess              1    73856 73932
## - WaterSafety             1    73857 73933
## - age                     1    73859 73935
## - SomeCollegeRate         1    73872 73948
## - HeavyDrinker            1    73887 73963
## - HealthInsurance         1    73915 73991
## - CurrentSmoker           1    73946 74022
## - income                  4    73955 74025
## - race                    1    74004 74080
## - PoorSleepDays           1    74157 74233
## - LimitedActivity         1    74235 74311
## - marital                 1    74263 74339
## - GeneralHealth           4    74400 74470
## - employ                  7    74570 74634
## - PoorMentalHealthDays    1    78145 78221
## - SocialSupport           4    81887 81957

Final Stepwise model

model.step <- glm(formula= df_final$wellbeing ~ . ,
             df_final[,(colnames(step$model[-1]))], family=binomial(logit))

summary(model.step)
## 
## Call:
## glm(formula = df_final$wellbeing ~ ., family = binomial(logit), 
##     data = df_final[, (colnames(step$model[-1]))])
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.5070  -0.2318  -0.1418  -0.1017   3.6267  
## 
## Coefficients:
##                                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                         -1.753e+00  1.675e-01 -10.468  < 2e-16 ***
## age                                 -3.471e-03  8.841e-04  -3.926 8.64e-05 ***
## SocialSupport Sometimes             -8.367e-01  3.212e-02 -26.045  < 2e-16 ***
## SocialSupportAlways                 -2.675e+00  3.616e-02 -73.970  < 2e-16 ***
## SocialSupportNever                  -7.535e-01  4.072e-02 -18.506  < 2e-16 ***
## SocialSupportUsually                -2.027e+00  3.449e-02 -58.778  < 2e-16 ***
## raceWhite                            3.330e-01  2.667e-02  12.484  < 2e-16 ***
## income[25000-35000)                 -5.192e-02  3.503e-02  -1.482 0.138265    
## income[35000-50000)                 -1.072e-01  3.569e-02  -3.004 0.002667 ** 
## income50000 or more                 -3.254e-01  3.388e-02  -9.603  < 2e-16 ***
## incomeless than 15000                2.021e-02  2.917e-02   0.693 0.488428    
## GeneralHealthFair                    6.651e-01  4.773e-02  13.935  < 2e-16 ***
## GeneralHealthGood                    3.628e-01  4.388e-02   8.268  < 2e-16 ***
## GeneralHealthPoor                    1.112e+00  5.445e-02  20.420  < 2e-16 ***
## GeneralHealthVery good               1.951e-01  4.478e-02   4.356 1.32e-05 ***
## PoorPhysicalHealthDays               3.207e-03  1.189e-03   2.697 0.006999 ** 
## PoorMentalHealthDays                 6.159e-02  9.227e-04  66.753  < 2e-16 ***
## HealthInsuranceWith Health Coverage -2.451e-01  2.884e-02  -8.496  < 2e-16 ***
## LimitedActivityNot Limited          -4.957e-01  2.490e-02 -19.908  < 2e-16 ***
## employRetired                       -2.974e-02  5.516e-02  -0.539 0.589730    
## employSelf-Employed                  2.631e-01  6.243e-02   4.214 2.51e-05 ***
## employStudent                        2.329e-01  9.292e-02   2.507 0.012179 *  
## employUnable to work                 4.547e-01  5.455e-02   8.335  < 2e-16 ***
## employUnemployed for less than 1 yr  9.517e-01  6.275e-02  15.168  < 2e-16 ***
## employUnemployed for more than 1 yr  9.380e-01  5.947e-02  15.773  < 2e-16 ***
## employWaged Employement              1.741e-01  5.160e-02   3.374 0.000741 ***
## BMILow                               4.723e-03  2.556e-02   0.185 0.853399    
## BMIMedium                           -6.638e-02  2.487e-02  -2.669 0.007603 ** 
## HeavyDrinkerNot a Heavy Drinker     -2.851e-01  4.214e-02  -6.764 1.34e-11 ***
## CurrentSmokerNot a Current Smoker   -2.406e-01  2.359e-02 -10.199  < 2e-16 ***
## PoorSleepDays                        1.699e-02  9.506e-04  17.870  < 2e-16 ***
## marital1                            -4.774e-01  2.347e-02 -20.344  < 2e-16 ***
## PrematureMortality                  -5.103e-04  1.723e-04  -2.963 0.003051 ** 
## ParkAccess                           1.955e-03  5.490e-04   3.562 0.000368 ***
## CrimeRate                            9.820e-05  4.787e-05   2.052 0.040209 *  
## UnemploymentRate                     7.354e-03  4.563e-03   1.611 0.107094    
## WaterSafety                         -2.551e-03  7.126e-04  -3.579 0.000344 ***
## SomeCollegeRate                      7.273e-03  1.382e-03   5.265 1.40e-07 ***
## PopulationDensity                    4.517e-06  2.476e-06   1.824 0.068084 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 115163  on 276348  degrees of freedom
## Residual deviance:  73844  on 276310  degrees of freedom
## AIC: 73922
## 
## Number of Fisher Scoring iterations: 7

Remove statistically insignificant predictors

# setup variable 
pred.m1 <- c("age","SocialSupport","race","income","GeneralHealth","PoorPhysicalHealthDays","PoorMentalHealthDays","HealthInsurance","LimitedActivity","employ","BMI","HeavyDrinker","CurrentSmoker","PoorSleepDays","marital","PrematureMortality","ParkAccess","CrimeRate","UnemploymentRate","WaterSafety","SomeCollegeRate")
# model with reduced predictors
model.1 <- glm(wellbeing ~ 
                 age+
                 SocialSupport+
                 race+
                 income+
                 GeneralHealth+
                 PoorPhysicalHealthDays+
                 PoorMentalHealthDays+
                 HealthInsurance+
                 LimitedActivity+
                 employ+
                 BMI+
                  HeavyDrinker+
                  CurrentSmoker+
                  PoorSleepDays+
                  marital+
                  PrematureMortality+
                  ParkAccess+
                  CrimeRate+
                  UnemploymentRate+
                  WaterSafety+
                  SomeCollegeRate,
               family=binomial(logit),
               data=df_final)

summary(model.1)
## 
## Call:
## glm(formula = wellbeing ~ age + SocialSupport + race + income + 
##     GeneralHealth + PoorPhysicalHealthDays + PoorMentalHealthDays + 
##     HealthInsurance + LimitedActivity + employ + BMI + HeavyDrinker + 
##     CurrentSmoker + PoorSleepDays + marital + PrematureMortality + 
##     ParkAccess + CrimeRate + UnemploymentRate + WaterSafety + 
##     SomeCollegeRate, family = binomial(logit), data = df_final)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.5102  -0.2318  -0.1419  -0.1017   3.6271  
## 
## Coefficients:
##                                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                         -1.779e+00  1.669e-01 -10.656  < 2e-16 ***
## age                                 -3.456e-03  8.842e-04  -3.909 9.27e-05 ***
## SocialSupport Sometimes             -8.361e-01  3.212e-02 -26.029  < 2e-16 ***
## SocialSupportAlways                 -2.675e+00  3.616e-02 -73.972  < 2e-16 ***
## SocialSupportNever                  -7.526e-01  4.071e-02 -18.486  < 2e-16 ***
## SocialSupportUsually                -2.027e+00  3.449e-02 -58.775  < 2e-16 ***
## raceWhite                            3.320e-01  2.667e-02  12.450  < 2e-16 ***
## income[25000-35000)                 -5.231e-02  3.503e-02  -1.493 0.135317    
## income[35000-50000)                 -1.074e-01  3.569e-02  -3.008 0.002627 ** 
## income50000 or more                 -3.245e-01  3.388e-02  -9.578  < 2e-16 ***
## incomeless than 15000                2.007e-02  2.917e-02   0.688 0.491432    
## GeneralHealthFair                    6.654e-01  4.773e-02  13.941  < 2e-16 ***
## GeneralHealthGood                    3.627e-01  4.388e-02   8.265  < 2e-16 ***
## GeneralHealthPoor                    1.112e+00  5.444e-02  20.420  < 2e-16 ***
## GeneralHealthVery good               1.949e-01  4.478e-02   4.353 1.34e-05 ***
## PoorPhysicalHealthDays               3.212e-03  1.189e-03   2.701 0.006907 ** 
## PoorMentalHealthDays                 6.160e-02  9.227e-04  66.758  < 2e-16 ***
## HealthInsuranceWith Health Coverage -2.447e-01  2.884e-02  -8.482  < 2e-16 ***
## LimitedActivityNot Limited          -4.954e-01  2.490e-02 -19.894  < 2e-16 ***
## employRetired                       -2.945e-02  5.516e-02  -0.534 0.593396    
## employSelf-Employed                  2.651e-01  6.241e-02   4.247 2.16e-05 ***
## employStudent                        2.334e-01  9.291e-02   2.512 0.012003 *  
## employUnable to work                 4.553e-01  5.455e-02   8.347  < 2e-16 ***
## employUnemployed for less than 1 yr  9.523e-01  6.275e-02  15.176  < 2e-16 ***
## employUnemployed for more than 1 yr  9.390e-01  5.947e-02  15.790  < 2e-16 ***
## employWaged Employement              1.746e-01  5.160e-02   3.383 0.000717 ***
## BMILow                               5.699e-03  2.556e-02   0.223 0.823530    
## BMIMedium                           -6.593e-02  2.487e-02  -2.651 0.008021 ** 
## HeavyDrinkerNot a Heavy Drinker     -2.847e-01  4.215e-02  -6.755 1.42e-11 ***
## CurrentSmokerNot a Current Smoker   -2.403e-01  2.359e-02 -10.183  < 2e-16 ***
## PoorSleepDays                        1.699e-02  9.506e-04  17.869  < 2e-16 ***
## marital1                            -4.783e-01  2.346e-02 -20.388  < 2e-16 ***
## PrematureMortality                  -5.024e-04  1.722e-04  -2.917 0.003531 ** 
## ParkAccess                           2.188e-03  5.332e-04   4.103 4.08e-05 ***
## CrimeRate                            1.087e-04  4.752e-05   2.287 0.022187 *  
## UnemploymentRate                     7.581e-03  4.559e-03   1.663 0.096362 .  
## WaterSafety                         -2.553e-03  7.124e-04  -3.584 0.000339 ***
## SomeCollegeRate                      7.457e-03  1.378e-03   5.411 6.25e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 115163  on 276348  degrees of freedom
## Residual deviance:  73847  on 276311  degrees of freedom
## AIC: 73923
## 
## Number of Fisher Scoring iterations: 7

Multicollinearity

Correlations for all numeric predictors

# library(corrplot)
# library(sjPlot)
rr_county.m1 <- df_final[,c(2,22, 24:27,29)]
rr_cm.m1 <- as.matrix(rr_county.m1)

corr_m1 <- cor(rr_cm.m1, use="pairwise.complete.obs")
Correlation Visualization
# correlogram
testRes.m1 <- cor.mtest(rr_cm.m1, conf.level = 0.95)
corrplot.mixed(corr_m1,
               upper = "ellipse",
               p.mat = testRes.m1$p,
               title="Correlation Matrix of Variables of Interest")

VIF for Reduced Predictors Model

vif_1 <- vif(model.1)
vif_1 <- as.data.frame(vif_1)
#create horizontal bar chart to display each VIF value
vif_plot_1 <- vif_1 %>%
  ggplot(aes(x=rownames(vif_1), y=GVIF)) +
  geom_col(color="#434343", fill="#FFB87B") +
  xlab("Statistically Significant Predictors") +
  ylab("VIF Values") +
  ggtitle("VIF Values for Reduced Model") +
  geom_hline(aes(yintercept=4),color="#434343", linetype="dashed", linewidth=0.5) +
  coord_flip()

vif_plot_1

Odds Ratios and CI of Reduced Predictors

# library(broom)
model.1 %>%
  tidy(conf.int=TRUE, exp=TRUE) %>%
  datatable(rownames=FALSE,
            colnames=c("predictor", "odds ratio", "std error", "statistic", "p-value", "CI low", "CI high"),
            options=list(pageLength=15, scrollX=T))
# forest plot
# library(finalfit)
#df_final %>%
 # or_plot(dependent, pred.m1)

Logistic Regression Using Machine Learning

Split the Data set

## set the seed to make your partition reproducible
set.seed(123)
train_index <- sample(seq_len(nrow(df_final)), size = 0.75*nrow(df_final))

train <- df_final[train_index, ]
test <- df_final[-train_index, ]
nrow(train)
## [1] 207261
nrow(test)
## [1] 69088

Train the Model

pacman::p_load(caret)
glm_model <- train(wellbeing ~ age + SocialSupport + race + income +  GeneralHealth + PoorPhysicalHealthDays +  PoorMentalHealthDays + HealthInsurance + LimitedActivity + employ + BMI + HeavyDrinker + CurrentSmoker + PoorSleepDays + marital + PrematureMortality + ParkAccess + CrimeRate + UnemploymentRate + WaterSafety + SomeCollegeRate, data = train, method = "glm", family = "binomial")
attributes(glm_model)
## $names
##  [1] "method"       "modelInfo"    "modelType"    "results"      "pred"        
##  [6] "bestTune"     "call"         "dots"         "metric"       "control"     
## [11] "finalModel"   "preProcess"   "trainingData" "ptype"        "resample"    
## [16] "resampledCM"  "perfNames"    "maximize"     "yLimits"      "times"       
## [21] "levels"       "terms"        "coefnames"    "contrasts"    "xlevels"     
## 
## $class
## [1] "train"         "train.formula"

Evaluate the Model Performance

glm_model$metric
## [1] "Accuracy"
glm_model$results
##   parameter  Accuracy     Kappa   AccuracySD     KappaSD
## 1      none 0.9519952 0.3577324 0.0004168047 0.006218255

Check the Predictions on the Test Data Set

test$pred <- glm_model %>% predict(test)

evaluate the performance-define accuracy function

accuracy_function = function(real, predicted) {
  mean(real == predicted)
}
accuracy_function(real = test$wellbeing,
         predicted = test$pred)
## [1] 0.9523362

Variable Importance of the Variables Used

ggplot(varImp(glm_model))

CARET MODEL to predict Wellbeing

Library Loading

library(rattle)
## Loading required package: bitops
## Rattle: A free graphical interface for data science with R.
## Version 5.5.1 Copyright (c) 2006-2021 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following object is masked from 'package:sjstats':
## 
##     ci
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(xgboost)
## 
## Attaching package: 'xgboost'
## The following object is masked from 'package:rattle':
## 
##     xgboost
## The following object is masked from 'package:dplyr':
## 
##     slice
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:rattle':
## 
##     importance
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
## The following object is masked from 'package:psych':
## 
##     outlier

Split the Data

set.seed(123)
train_index <- sample(seq_len(nrow(df_final)), size = 0.75*nrow(df_final))

train <- df_final[train_index, ]
test <- df_final[-train_index, ]
cat(nrow(test),nrow(train))
## 69088 207261

Holdout

levels(train$wellbeing) <- c("Good_Wellbeing", "Poor_Wellbeing")
trcntrl_ho<-trainControl(classProbs = TRUE,summaryFunction = twoClassSummary)

CART with Auto Hyperparameter Tuning

model_CART <- train(wellbeing ~ age + SocialSupport + race + income +  GeneralHealth + PoorPhysicalHealthDays +  PoorMentalHealthDays + HealthInsurance + LimitedActivity + employ + BMI + HeavyDrinker + CurrentSmoker + PoorSleepDays + marital + PrematureMortality + ParkAccess + CrimeRate + UnemploymentRate + WaterSafety + SomeCollegeRate ,                 data = train, method = "rpart",trControl=trcntrl_ho,metric="ROC",tuneLength = 10)
model_CART$results
##              cp       ROC      Sens      Spec       ROCSD       SensSD
## 1  0.0004980080 0.7472576 0.9910863 0.2318332 0.022547168 0.0007790825
## 2  0.0005432814 0.7431999 0.9914031 0.2291817 0.005868611 0.0006624901
## 3  0.0006338283 0.7437871 0.9916812 0.2279633 0.005898479 0.0007834401
## 4  0.0007243752 0.7442818 0.9917585 0.2291082 0.005799676 0.0007911671
## 5  0.0013582035 0.7450477 0.9922785 0.2234589 0.005537854 0.0008667502
## 6  0.0015392974 0.7450434 0.9923658 0.2229886 0.005544318 0.0009054454
## 7  0.0020373053 0.7450245 0.9922127 0.2255714 0.005558849 0.0011004996
## 8  0.0024447664 0.7449736 0.9924919 0.2196554 0.005586875 0.0012487124
## 9  0.0143969576 0.7248273 0.9938143 0.1772921 0.067884816 0.0026939664
## 10 0.0171133647 0.6446700 0.9946944 0.1322237 0.120574485 0.0044303138
##         SpecSD
## 1  0.010431924
## 2  0.008608276
## 3  0.011127379
## 4  0.011049154
## 5  0.014123109
## 6  0.016082827
## 7  0.020373734
## 8  0.024168458
## 9  0.059100999
## 10 0.110228244
plot(model_CART)

Variable Importance of the CART Model

ggplot(varImp(model_CART))

print(varImp(model_CART))
## rpart variable importance
## 
##   only 20 most important variables shown (out of 43)
## 
##                                      Overall
## PoorMentalHealthDays                100.0000
## LimitedActivityNot Limited           71.1397
## GeneralHealthPoor                    67.0912
## employUnable to work                 60.5110
## PoorPhysicalHealthDays               55.0010
## SocialSupportUsually                 25.3124
## SocialSupportAlways                  20.0508
## marital1                             17.5696
## SocialSupport Sometimes               8.3734
## age                                   4.0841
## employRetired                         2.1720
## raceWhite                             1.9972
## PoorSleepDays                         1.9718
## PrematureMortality                    1.4345
## CrimeRate                             1.4050
## UnemploymentRate                      1.3888
## employUnemployed for less than 1 yr   1.3424
## SomeCollegeRate                       1.2201
## SocialSupportNever                    0.9717
## ParkAccess                            0.6516

Plot the Tree

plot(model_CART$finalModel, uniform=TRUE,
     main="Classification Tree")
text(model_CART$finalModel, all=FALSE, cex=.7)

# Using the Rattle Library
library(rattle)
fancyRpartPlot(model_CART$finalModel,cex = 0.9)

Prediction on Test Data

test$pred_cart <- model_CART %>% predict(test)
prob_cart<- model_CART %>% predict(test,type='prob')
confusionMtx=table(test$pred_cart,test$wellbeing)
confusionMtx
##                 
##                  Good_Wellbeing Poor_Wellbeing
##   Good_Wellbeing          64907           2890
##   Poor_Wellbeing            472            819

Create the Confusion Matrix

TP=confusionMtx[1,1]
FP=confusionMtx[1,2]
TN=confusionMtx[2,2]
FN=confusionMtx[2,1]  
TP
## [1] 64907

Calcaulating Recall

recall=TP/(TP+FN)
recall
## [1] 0.9927806

Calcualting Accuracy

Accuracy=(TP+TN)/nrow(test)
Accuracy
## [1] 0.9513374

Calacualting Specificity

specificity=TN/(TN+FP)
specificity
## [1] 0.2208142

Calculate Precision

precision=TP/(TP+FP)
precision
## [1] 0.9573727

Calculate F1-score

F1=(2*recall*precision)/(recall+precision)
F1
## [1] 0.9747552
str(prob_cart)
## 'data.frame':    69088 obs. of  2 variables:
##  $ Good_Wellbeing: num  0.973 0.973 0.973 0.973 0.973 ...
##  $ Poor_Wellbeing: num  0.0272 0.0272 0.0272 0.0272 0.0272 ...
CART.roc=roc(predictor=prob_cart$Good_Wellbeing,response=test$wellbeing,
             levels=levels(test$wellbeing))
## Setting direction: controls > cases
CART.roc
## 
## Call:
## roc.default(response = test$wellbeing, predictor = prob_cart$Good_Wellbeing,     levels = levels(test$wellbeing))
## 
## Data: prob_cart$Good_Wellbeing in 65379 controls (test$wellbeing Good_Wellbeing) > 3709 cases (test$wellbeing Poor_Wellbeing).
## Area under the curve: 0.7341
ggroc(CART.roc, alpha = 0.5, colour = "red", linetype = 2, size =0.7 )